In [24]:
# Day 8: Hyperparameter Tuning with GridSearch + Optuna

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import optuna
import joblib  # for saving models




In [26]:
data = pd.read_csv(r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis\data\processed\ev_merged_cleaned.csv")

print("Dataset shape:", data.shape)
data.head()

# Features & Target
X = data.drop(columns=["ev_share"])
y = data["ev_share"]

# Train/Test split (same as Day 7 for consistency)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Dataset shape: (492, 8)
Train shape: (393, 7) Test shape: (99, 7)


In [27]:
# K-Fold cross validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
print("Using 5-Fold cross-validation for reliable evaluation.")


Using 5-Fold cross-validation for reliable evaluation.


In [30]:
# Drop text columns that can't be used directly in ML models
X = data.drop(columns=["ev_share", "country", "iso_code"])

# Target
y = data["ev_share"]

# Train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Features used:", X_train.columns.tolist())

Train shape: (393, 5) Test shape: (99, 5)
Features used: ['year', 'population', 'total_chargers', 'charger_density_per_100k', 'population_million']


In [31]:
# Cell 4: GridSearch for Random Forest

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}

# Initialize Random Forest
rf = RandomForestRegressor(random_state=42)

# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                 # 5-fold cross-validation
    n_jobs=-1,            # use all CPU cores
    scoring="r2",         # R² score for regression
    verbose=2
)

# Fit on training data
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best R2 Score: 0.7174625610221124


In [32]:
# Get best estimator
best_rf = grid_search.best_estimator_

y_pred_rf = best_rf.predict(X_test)

print("Random Forest Test R2:", r2_score(y_test, y_pred_rf))
print("Random Forest Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))


Random Forest Test R2: 0.4493579366669028
Random Forest Test RMSE: 10.164496519513278


In [33]:
from xgboost import XGBRegressor

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }
    
    model = XGBRegressor(**params, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring="r2", n_jobs=-1)
    return scores.mean()

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best XGB Params:", study.best_params)
print("Best XGB CV Score:", study.best_value)


[I 2025-08-23 09:57:21,033] A new study created in memory with name: no-name-cf2c1cc3-7f5b-4b5a-98b9-86bf15cbf2e5
[I 2025-08-23 09:57:24,904] Trial 0 finished with value: 0.5688199539687715 and parameters: {'n_estimators': 391, 'max_depth': 5, 'learning_rate': 0.04407900970947291, 'subsample': 0.5414041626922883, 'colsample_bytree': 0.9244204173348898}. Best is trial 0 with value: 0.5688199539687715.
[I 2025-08-23 09:57:26,423] Trial 1 finished with value: 0.41504849817875566 and parameters: {'n_estimators': 283, 'max_depth': 3, 'learning_rate': 0.011318579217678941, 'subsample': 0.5503326179649952, 'colsample_bytree': 0.5387266662116148}. Best is trial 0 with value: 0.5688199539687715.
[I 2025-08-23 09:57:30,060] Trial 2 finished with value: 0.30058923414916305 and parameters: {'n_estimators': 487, 'max_depth': 11, 'learning_rate': 0.1227226687341205, 'subsample': 0.9509812798709094, 'colsample_bytree': 0.5727014410346902}. Best is trial 0 with value: 0.5688199539687715.
[I 2025-08-23

Best XGB Params: {'n_estimators': 499, 'max_depth': 6, 'learning_rate': 0.24592410416690502, 'subsample': 0.8017809701379595, 'colsample_bytree': 0.9100402056216096}
Best XGB CV Score: 0.6103579267679062


In [34]:
# Train XGB with best params
best_xgb = XGBRegressor(**study.best_params, random_state=42)
best_xgb.fit(X_train, y_train)

y_pred_xgb = best_xgb.predict(X_test)

print("XGBoost Test R2:", r2_score(y_test, y_pred_xgb))
print("XGBoost Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))


XGBoost Test R2: 0.546971568886504
XGBoost Test RMSE: 9.219638837108075


In [36]:
# Save tuned models
joblib.dump(best_rf, r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis/best_random_forest.pkl")
joblib.dump(best_xgb, r"C:\Users\Saket\OneDrive\Desktop\4th sem\python\EDA\EV_Market_Analysis/best_xgboost.pkl")

print("Models saved for Day 9 analysis!")


Models saved for Day 9 analysis!
