In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Load the preprocessed feature matrix and the target vector from disk.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only point these paths at artifacts produced by a trusted pipeline.
X = joblib.load("../models/X_processed.pkl")
y = joblib.load("../models/y.pkl")

# Quick sanity check: print the shape of the features, then of the target.
print(*(loaded.shape for loaded in (X, y)))


(1338, 11) (1338,)


In [3]:
def evaluate_model(model, X, y):
    """Score a fitted regressor against known target values.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X
        Features handed to ``model.predict``.
    y
        True target values the predictions are compared with.

    Returns
    -------
    dict
        Keys ``"MAE"``, ``"RMSE"`` and ``"R2"`` (in that order) mapped to the
        mean absolute error, the square root of the mean squared error and
        the R^2 score of the predictions.
    """
    predictions = model.predict(X)

    mae = mean_absolute_error(y, predictions)
    # RMSE is derived by taking the square root of the MSE.
    rmse = np.sqrt(mean_squared_error(y, predictions))
    r2 = r2_score(y, predictions)

    return {"MAE": mae, "RMSE": rmse, "R2": r2}


In [4]:
# Base Ridge regressor with default settings; alpha is tuned by the grid below.
ridge = Ridge()

# Candidate regularisation strengths explored by the grid search.
ridge_params = dict(alpha=[0.1, 1.0, 10.0])


In [5]:
# 5-fold grid search over ridge_params, ranked by R^2, using all CPU cores.
ridge_gs = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)

ridge_gs.fit(X, y)


In [6]:
# Keep the winning Ridge configuration and persist it for later use.
best_ridge = ridge_gs.best_estimator_

joblib.dump(best_ridge, "../models/ridge_optimized.pkl")

# NOTE(review): X, y are the same data ridge_gs was fitted on, so the metrics
# displayed here are training-set metrics, not a held-out estimate.
# ridge_gs.best_score_ holds the cross-validated R^2 of this configuration.
evaluate_model(model=best_ridge, X=X, y=y)


{'MAE': 4174.379715775466, 'RMSE': 6041.721698258853, 'R2': 0.7509095675437587}

In [7]:
# Random forest with a fixed seed; n_jobs=-1 lets it use every available core.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Grid: number of trees x maximum tree depth (None leaves depth unrestricted).
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10],
)


In [8]:
# 5-fold grid search over rf_params, ranked by R^2, using all CPU cores.
rf_gs = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)

rf_gs.fit(X, y)


In [9]:
# Keep the winning random-forest configuration and persist it for later use.
best_rf = rf_gs.best_estimator_

joblib.dump(best_rf, "../models/random_forest_optimized.pkl")

# NOTE(review): X, y are the same data rf_gs was fitted on, so the metrics
# displayed here are training-set metrics, not a held-out estimate.
# rf_gs.best_score_ holds the cross-validated R^2 of this configuration.
evaluate_model(model=best_rf, X=X, y=y)


{'MAE': 1223.0276613726282,
 'RMSE': 2234.473537189864,
 'R2': 0.9659289086191667}

In [10]:
# Gradient boosting regressor with a fixed seed.
gb = GradientBoostingRegressor(random_state=42)

# Grid: learning rate x maximum depth of the individual trees.
gb_params = dict(
    learning_rate=[0.05, 0.1],
    max_depth=[2, 3],
)


In [11]:
# 5-fold grid search over gb_params, ranked by R^2, using all CPU cores.
gb_gs = GridSearchCV(
    estimator=gb,
    param_grid=gb_params,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)

gb_gs.fit(X, y)


In [12]:
# Keep the winning gradient-boosting configuration and persist it for later use.
best_gb = gb_gs.best_estimator_

joblib.dump(best_gb, "../models/gradient_boosting_optimized.pkl")

# NOTE(review): X, y are the same data gb_gs was fitted on, so the metrics
# displayed here are training-set metrics, not a held-out estimate.
# gb_gs.best_score_ holds the cross-validated R^2 of this configuration.
evaluate_model(model=best_gb, X=X, y=y)


{'MAE': 2254.9823302312975,
 'RMSE': 4105.817794108148,
 'R2': 0.8849636856067044}

In [13]:
# Side-by-side comparison of the three tuned models.
# Each model is scored exactly once; the R2 and RMSE columns are both read
# from that single evaluation instead of re-running predict() per column.
tuned_models = [
    ("Ridge Optimized", best_ridge),
    ("Random Forest Optimized", best_rf),
    ("Gradient Boosting Optimized", best_gb),
]

# NOTE(review): X, y are the data the grid searches in this notebook were
# fitted on, so these are training-set scores rather than held-out estimates.
model_scores = [evaluate_model(model, X, y) for _, model in tuned_models]

results = pd.DataFrame({
    "Model": [label for label, _ in tuned_models],
    "R2": [scores["R2"] for scores in model_scores],
    "RMSE": [scores["RMSE"] for scores in model_scores],
})

results


Unnamed: 0,Model,R2,RMSE
0,Ridge Optimized,0.75091,6041.721698
1,Random Forest Optimized,0.965929,2234.473537
2,Gradient Boosting Optimized,0.884964,4105.817794
