In [7]:
import os
import sys

import numpy as np
import pandas as pd
# HalvingGridSearchCV is experimental in scikit-learn: this enabling import has
# to run before the class can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import HalvingGridSearchCV



sys.path.append('..')
from Workflow import Workflow

In [8]:
# Build the project workflow helper and unpack the six arrays it returns:
# features/targets for the train, validation and test splits, in that order.
wf = Workflow()
(
    Xtrain, ytrain,
    Xvalid, yvalid,
    Xtest, ytest,
) = wf.load_data()

In [9]:
def optimize_rf_model(X, y, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, seed=42):
    """Tune a RandomForestRegressor with a successive-halving grid search.

    Parameters
    ----------
    X, y
        Training features and targets; passed unchanged to the search's ``fit``.
    param_grid : dict
        Hyperparameter grid forwarded to ``HalvingGridSearchCV``.
    cv
        Cross-validation setting forwarded to the search (default 5).
    scoring : str
        Scoring name forwarded to the search. With the default
        ``'neg_mean_squared_error'`` the printed best score is a negated MSE.
    n_jobs : int
        Parallelism forwarded to the search (default -1).
    seed : int
        Random state used for BOTH the forest and the halving search, so that
        repeated calls with the same inputs select the same candidate.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search.
    """
    estimator = RandomForestRegressor(random_state=seed)

    # Use HalvingGridSearchCV for hyperparameter optimization
    grid_search = HalvingGridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        aggressive_elimination=True,
        # Seeding only the estimator is not enough: the halving search draws
        # its own row subsamples at each iteration, so it needs the seed too.
        random_state=seed,
        verbose=3,
    )

    # Fit the model to the data
    grid_search.fit(X, y)

    # Output the results
    print(f'Best parameters found: {grid_search.best_params_}')
    print(f'Best score: {grid_search.best_score_}')

    return grid_search.best_estimator_

In [11]:
# Search space for the random-forest hyperparameter tuning run.
param_grid = dict(
    n_estimators=[70, 90, 110],
    max_depth=[40, 50, 60],
    max_features=['log2'],
    min_samples_leaf=[4, 6],
    min_samples_split=[4, 6],
    bootstrap=[False, True],
)

# Run the search on the training split only; validation and test data are
# held back for the evaluation below.
seed = 42
best_model = optimize_rf_model(
    Xtrain,
    ytrain,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    seed=seed,
)

# Predict once per split with the selected model.
y_train_pred = best_model.predict(Xtrain)
y_valid_pred = best_model.predict(Xvalid)
y_test_pred = best_model.predict(Xtest)

# Score each split: R2 everywhere, plus RMSE on the validation split.
train_r2 = metrics.r2_score(ytrain, y_train_pred)
validation_rmse = np.sqrt(
    metrics.mean_squared_error(yvalid, y_valid_pred)
)
validation_r2 = metrics.r2_score(yvalid, y_valid_pred)
test_r2 = metrics.r2_score(ytest, y_test_pred)

# Report the metrics in a fixed order, one per line.
for label, value in (
    ("Training R2 Score:", train_r2),
    ("Validation RMSE:", validation_rmse),
    ("Validation R2 Score:", validation_r2),
    ("Test R2 Score:", test_r2),
):
    print(label, value)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 2920
max_resources_: 78854
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 72
n_resources: 2920
Fitting 5 folds for each of 72 candidates, totalling 360 fits
----------
iter: 1
n_candidates: 24
n_resources: 8760
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 8
n_resources: 26280
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 3
n_candidates: 3
n_resources: 78840
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters found: {'bootstrap': False, 'max_depth': 60, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 4, 'n_estimators': 70}
Best score: -2766.5688881803485
Training R2 Score: 0.8468822648264323
Validation RMSE: 53.62286402599687
Validation R2 Score: 0.6996384523645774
Test R2 Score: 0.7347727623089177
