# GridSearchCV - finding best hyperparameters

+ Comparison of 3 different models to predict a numerical variable (CO2 emission)
+ Linear Regression 
+ RandomForestRegressor 
+ SVR 

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# Base estimators to compare. Only the random forest is given a seed here;
# LinearRegression and SVR are built with their default arguments.
clf_lr = LinearRegression()
clf_rf = RandomForestRegressor(random_state=22)
clf_svr = SVR()

# Hyperparameter grids, one per estimator.
# The empty grid makes the search evaluate only the default configuration, so
# LinearRegression can flow through the same GridSearchCV pipeline as the
# tuned models.
param_grid_lr = {}

param_grid_rf = [{'n_estimators': [10, 50, 100, 250, 500, 1000],
                  'min_samples_leaf': [1, 3, 5],
                  'max_features': ['sqrt', 'log2']}]

# Two sub-grids because 'gamma' is only searched for the RBF kernel.
# NOTE(review): np.logspace(-4, 0, 4) yields 4 values whose exponents are
# -4, -2.67, -1.33 and 0 (not whole decades); num=5 would give 1e-4 ... 1e0.
# Left unchanged -- confirm which grid was intended.
param_grid_svr = [{'kernel': ['rbf'], 'C': np.logspace(-4, 4, 9), 'gamma': np.logspace(-4, 0, 4)},
                  {'kernel': ['linear'], 'C': np.logspace(-4, 4, 9)}]

# One GridSearchCV (the inner model-selection loop) per estimator, keyed by a
# display name.
gridcvs = {}

for pgrid, clf, name in zip((param_grid_lr, param_grid_rf, param_grid_svr),
                            (clf_lr, clf_rf, clf_svr),
                            ('LinearRegression', 'RandomForest', 'SVR')):
    # Select hyperparameters with the same metric that is reported later
    # (MSE). Without an explicit `scoring`, the search would rank candidates
    # by the regressor's default score (R^2) instead.
    # refit=True retrains the best configuration on all the data passed to
    # fit(), which is what makes gcv.predict() usable afterwards.
    gcv = GridSearchCV(clf, pgrid, scoring='neg_mean_squared_error',
                       cv=3, refit=True)
    gridcvs[name] = gcv

# Outer cross-validation: shuffled 3-fold split with a fixed seed so every
# model is scored on the same folds.
outer_cv = KFold(n_splits=3, shuffle=True, random_state=22)
# Maps model name -> array of per-fold neg_mean_squared_error scores
# (values are <= 0; negate them to obtain MSE).
outer_scores = {}

# Assume X_train, y_train, X_test, y_test are defined elsewhere in the code
for name, gs in gridcvs.items():
    # Run the grid search on the full training set; with refit=True the best
    # configuration is retrained and used by gs.predict() below.
    gs.fit(X_train, y_train)

    # An empty best_params_ dict means the grid had nothing to tune.
    best_params = gs.best_params_ or "Default parameters"
    print(f'{name} Best Parameters: {best_params}')

    # Mean squared error (MSE) of the refitted best model on training and
    # test data.
    # NOTE(review): the test set is scored for every candidate model; if the
    # final model is picked from these numbers the test MSE is no longer an
    # unbiased estimate -- confirm this is only used for reporting.
    train_pred = gs.predict(X_train)
    test_pred = gs.predict(X_test)

    train_mse = mean_squared_error(y_true=y_train, y_pred=train_pred)
    test_mse = mean_squared_error(y_true=y_test, y_pred=test_pred)

    print(f'{name} Training MSE: {train_mse:.2f}')
    print(f'{name} Test MSE: {test_mse:.2f}')

    # Nested CV: cross_val_score re-runs the whole grid search inside each
    # outer training fold (on clones of gs, so the fit above is untouched)
    # and scores it on the held-out outer fold.
    nested_score = cross_val_score(gs, X_train, y_train, cv=outer_cv, scoring='neg_mean_squared_error')
    outer_scores[name] = nested_score

    # Scores are negated MSE, so flip the sign of the mean only. The standard
    # deviation is unaffected by negation and must stay non-negative.
    print(f'{name}: Outer MSE {(-nested_score.mean()):.2f} +/- {nested_score.std():.2f}\n')
