In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, learning_curve,cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from math import sqrt
import joblib

In [2]:


#  Data Load & Preprocess

df = pd.read_csv("hour.csv")
df['dteday'] = pd.to_datetime(df["dteday"])

# Extract Date Features
df['year'] = df['dteday'].dt.year
df['month'] = df['dteday'].dt.month
df['day'] = df['dteday'].dt.day
df['dayofweek'] = df['dteday'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x>=5 else 0)

# Interaction Features
df['temp_hum'] = df['temp'] * df['hum']
df['wind_season'] = df['windspeed'] * df['season']

# Target Transformation (Log)
df['cnt_log'] = np.log1p(df['cnt'])

# Drop Unnecessary Columns
df.drop(['instant', 'dteday', 'casual', 'registered', 'cnt'], axis=1, inplace=True)

In [3]:
df.head(5)

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,year,month,day,dayofweek,is_weekend,temp_hum,wind_season,cnt_log
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,2011,1,1,5,1,0.1944,0.0,2.833213
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,2011,1,1,5,1,0.176,0.0,3.713572
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,2011,1,1,5,1,0.176,0.0,3.496508
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,2011,1,1,5,1,0.18,0.0,2.639057
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,2011,1,1,5,1,0.18,0.0,0.693147


In [4]:

# Train-Test Split & Scaling

X = df.drop('cnt_log', axis=1)
y = df['cnt_log']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling (Optional for GBM, Required for Linear)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
models ={
    "RandomForestRegressor":RandomForestRegressor(),
    "GradientBoostingRegressor":GradientBoostingRegressor(),
    "LinearRegression":LinearRegression(),
    "ExtraTreeRegressor":ExtraTreeRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor()
}
best_score = 0
best_model_name =None

In [6]:
param_distribution = {
    "RandomForestRegressor": {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt'],
        'bootstrap': [True, False]
    },
    "GradientBoostingRegressor": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 1.0],
        'max_features': ['auto', 'sqrt']
    },
    "LinearRegression": {
        # LinearRegression has very few tunable hyperparameters
        'fit_intercept': [True, False],
        'positive': [True, False]
    },
    "ExtraTreeRegressor": {
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt']
    },
    "DecisionTreeRegressor": {
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt']
    }
}

In [7]:
for name, model in models.items():
    print(f"Tuning Hyperparameters for {name}...")

    # Hyperparameter tuning using RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distribution[name],
        n_iter=10,
        scoring="r2",
        cv=5,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    random_search.fit(X_train_scaled, y_train)

    best_model = random_search.best_estimator_

    print(f"Training {name} with Best Parameters: {random_search.best_params_}")

    # Cross Validation with best model
    cross_val = cross_val_score(best_model, X_train_scaled, y_train, scoring="r2", cv=5, n_jobs=-1, verbose=0)
    print(f"Cross Validation R2 Scores: {cross_val}")
    print(f"Cross Validation Mean R2 Score: {cross_val.mean():.4f}")

    y_train_log = np.log1p(y_train)
    best_model.fit(X_train_scaled, y_train_log)
    y_pred_log = best_model.predict(X_test_scaled)
    y_pred = np.expm1(y_pred_log)

    # Evaluation Metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100 

    print(f"Test R2 Score: {r2:.4f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAPE: {mape:.2f}%")
    print("-------------------------------------------------")

    if cross_val.mean() > best_score:
        best_score = cross_val.mean()
        best_model_name = name

print(f"Best Cross Validation Score: {best_score:.4f}")
print(f"Best Model Name: {best_model_name}")

Tuning Hyperparameters for RandomForestRegressor...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Training RandomForestRegressor with Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
Cross Validation R2 Scores: [0.89920003 0.90464803 0.90600279 0.89916367 0.90666167]
Cross Validation Mean R2 Score: 0.9031
Test R2 Score: 0.9095
Test MAE: 0.32
Test RMSE: 0.43
Test MAPE: 10.28%
-------------------------------------------------
Tuning Hyperparameters for GradientBoostingRegressor...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Training GradientBoostingRegressor with Best Parameters: {'subsample': 0.8, 'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.05}
Cross Validation R2 Scores: [0.94357302 0.94696446 0.94805175 0.94700425 0.94881717]
Cross Validation Mean R2 Score: 0.9469
Test R2 

In [8]:

joblib.dump(best_model, "best_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']