In [13]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Load dataset
df = pd.read_csv("data/hour.csv", parse_dates=["dteday"])
X = df.drop(columns=["cnt", "dteday"])
y = df["cnt"]

# Time-aware cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Parameter grids
rf_param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [6, 10, None],
    "max_features": ["sqrt", "log2", 0.3]
}
gb_param_grid = {
    "n_estimators": [100, 300, 500],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 6, 10],
    "subsample": [0.7, 0.9, 1.0]
}
xgb_param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

# Evaluation helper
def evaluate_model(model_name, model, param_grid, X, y, tscv, results_path="results/tree_models_search.csv"):
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=10,
        scoring="neg_mean_absolute_error",
        cv=tscv,
        verbose=2,
        n_jobs=-1,
        random_state=42
    )

    start_time = time.time()
    search.fit(X, y)
    train_time = time.time() - start_time

    best_model = search.best_estimator_

    # Use last validation split for evaluation
    for train_idx, valid_idx in tscv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    y_pred = best_model.predict(X_valid)

    mae = mean_absolute_error(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    r2 = r2_score(y_valid, y_pred)
    n, p = len(y_valid), X_valid.shape[1]
    adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    result = pd.DataFrame([{
        "model": model_name,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "Adjusted_R2": adj_r2,
        "Best_Params": search.best_params_,
        "Train_Time": train_time
    }])

    # Save results
    try:
        old_results = pd.read_csv(results_path)
        final_results = pd.concat([old_results, result], ignore_index=True)
    except FileNotFoundError:
        final_results = result

    final_results.to_csv(results_path, index=False)
    print(f"\n✅ {model_name} results saved to {results_path}")
    return result

# Run models
rf_results = evaluate_model("RandomForest", RandomForestRegressor(random_state=42), rf_param_grid, X, y, tscv)
gb_results = evaluate_model("GradientBoosting", GradientBoostingRegressor(random_state=42), gb_param_grid, X, y, tscv)
xgb_results = evaluate_model("XGBoost", XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=-1), xgb_param_grid, X, y, tscv)

# Final Results
print("\n📊 Final Results:")
print(pd.concat([rf_results, gb_results, xgb_results], ignore_index=True))


Fitting 5 folds for each of 10 candidates, totalling 50 fits

✅ RandomForest results saved to results/tree_models_search.csv
Fitting 5 folds for each of 10 candidates, totalling 50 fits

✅ GradientBoosting results saved to results/tree_models_search.csv
Fitting 5 folds for each of 10 candidates, totalling 50 fits

✅ XGBoost results saved to results/tree_models_search.csv

📊 Final Results:
              model       MAE      RMSE        R2  Adjusted_R2  \
0      RandomForest  2.659575  4.595428  0.999554     0.999552   
1  GradientBoosting  0.299457  0.403226  0.999997     0.999997   
2           XGBoost  0.054611  0.083225  1.000000     1.000000   

                                         Best_Params  Train_Time  
0  {'n_estimators': 100, 'max_features': 0.3, 'ma...   54.907524  
1  {'subsample': 0.9, 'n_estimators': 500, 'max_d...  145.326069  
2  {'subsample': 1.0, 'n_estimators': 500, 'max_d...   27.730578  
