# Regression models 

## Importing Libraries

In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, r_regression, RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Load train/test data

In [2]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, "rb") as f:
        return pickle.load(f)


# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
X_train = _read_pickle("processed_data/X_train_reg_linear.pkl")
X_test = _read_pickle("processed_data/X_test_reg_linear.pkl")
y_train = _read_pickle("processed_data/y_train_reg_linear.pkl")
y_test = _read_pickle("processed_data/y_test_reg_linear.pkl")


## Scaling using StandardScaler

In [3]:
# Only the columns listed here are standardised; no other column is reassigned
col_to_scale = ['remainder__tenure']

# Learn the scaling statistics from the training split alone, then apply the
# same fitted scaler to both splits
scaler_reg = StandardScaler()
scaler_reg.fit(X_train[col_to_scale])
X_train[col_to_scale] = scaler_reg.transform(X_train[col_to_scale])
X_test[col_to_scale] = scaler_reg.transform(X_test[col_to_scale])

## Model training: feature selection and hyperparameter tuning

#### Evaluation function

In [4]:
def evaluate_regression(y_true, y_pred, model_name="model"):
    """Print and return MAE, RMSE and R² for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values to compare against ``y_true``.
        model_name: Label printed above the metrics and stored under the
            ``"model"`` key of the returned dict.

    Returns:
        dict with keys ``"model"``, ``"mae"``, ``"rmse"`` and ``"r2"``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error
    mse = mean_squared_error(y_true, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    metrics = {"model": model_name, "mae": mae, "rmse": rmse, "r2": r2}
    print(f"{model_name}\nMAE: {mae:.4f}\nRMSE: {rmse:.4f}\nR²: {r2:.4f}")
    return metrics

#### Feature Selection

In [5]:
# NOTE: this cell is currently disabled — every line is commented out, so no
# `selector` is created and no feature selection is applied.
# NOTE(review): the threshold below is computed with np.percentile(f_scores, 60),
# while the comments and the printed message refer to the 75th percentile —
# confirm which value is intended before re-enabling.
# # Feature Selection using SelectKBest with percentile of F-scores
# selector = SelectKBest(score_func=f_regression, k='all')  # Fit with all features to get scores
# selector.fit(X_train, y_train)

# # Get F-scores and determine k based on top 75% percentile of scores
# f_scores = selector.scores_
# score_threshold = np.percentile(f_scores, 60)  # Top 75% percentile of F-scores
# k = np.sum(f_scores >= score_threshold)  # Number of features above the threshold
# selector.k = k  # Set k to the calculated number

# # Get selected features and their scores
# selected_features_mask = selector.get_support()
# selected_features = X_train.columns[selected_features_mask].tolist()
# feature_scores = selector.scores_[selected_features_mask]
# print(f"Number of selected features (based on 75th percentile of F-scores): {k}")
# print("Selected features and their F-scores:")
# for name, score in zip(selected_features, feature_scores):
#     print(f"- {name}: {score:.4f}")

In [6]:
# NOTE: this cell is currently disabled. It needs the `selector` object from
# the commented-out feature-selection cell; the models in this notebook are
# fit on X_train / X_test directly rather than on the *_selected arrays.
# # Transform training and test sets with selected features
# X_train_selected = selector.transform(X_train)
# X_test_selected = selector.transform(X_test)

# # Debug: Print selected feature count
# print(f"Selected number of features: {X_train_selected.shape[1]}")

### 1. Linear Regression model

In [7]:
# Baseline: ordinary least squares fitted on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [8]:
# Score the baseline on the test split and keep the metrics for the recap
y_pred = lr.predict(X_test)
lr_results = evaluate_regression(
    y_true=y_test,
    y_pred=y_pred,
    model_name="LinearRegression",
)

LinearRegression
MAE: 23.3569
RMSE: 28.1714
R²: -0.0116


#### 1.1 Polynomial + Linear Regression

In [9]:
# Degree-2 polynomial expansion: fit on the training split, then apply the
# fitted transformer to both splits
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [10]:
# Linear regression on the polynomial-expanded training features
lr_poly = LinearRegression()
lr_poly.fit(X=X_train_poly, y=y_train)

In [11]:
# Score the polynomial model on the expanded test features
y_pred_poly = lr_poly.predict(X_test_poly)
poly_results = evaluate_regression(
    y_true=y_test,
    y_pred=y_pred_poly,
    model_name="Poly(d=2)+LinearRegression",
)

Poly(d=2)+LinearRegression
MAE: 23.9043
RMSE: 28.9431
R²: -0.0678


#### 1.2 Ridge Regression with Hyperparameter Tuning

In [12]:
# Ridge regression: choose alpha by 5-fold cross-validated R² on the training split
param_grid_ridge = {'alpha': [0.1, 1.0, 10.0, 100.0]}  # candidate regularisation strengths
ridge = Ridge()
grid_search_ridge = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid_ridge,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search_ridge.fit(X_train, y_train)

# Evaluate the refitted best estimator on the test split
best_ridge = grid_search_ridge.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)
ridge_results = evaluate_regression(
    y_test,
    y_pred_ridge,
    f"Ridge(alpha={grid_search_ridge.best_params_['alpha']})",
)
print(f"Best alpha from GridSearchCV (Ridge): {grid_search_ridge.best_params_['alpha']}")

Ridge(alpha=100.0)
MAE: 23.3404
RMSE: 28.1560
R²: -0.0105
Best alpha from GridSearchCV (Ridge): 100.0


#### 1.3 Lasso Regression with Hyperparameter Tuning

In [13]:
# Lasso regression: choose alpha by 5-fold cross-validated R² on the training split
param_grid_lasso = {'alpha': [0.1, 1.0, 10.0, 100.0]}  # candidate regularisation strengths
lasso = Lasso()
grid_search_lasso = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid_lasso,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search_lasso.fit(X_train, y_train)

# Evaluate the refitted best estimator on the test split
best_lasso = grid_search_lasso.best_estimator_
y_pred_lasso = best_lasso.predict(X_test)
lasso_results = evaluate_regression(
    y_test,
    y_pred_lasso,
    f"Lasso(alpha={grid_search_lasso.best_params_['alpha']})",
)
print(f"Best alpha from GridSearchCV (Lasso): {grid_search_lasso.best_params_['alpha']}")

Lasso(alpha=1.0)
MAE: 23.1776
RMSE: 28.0235
R²: -0.0010
Best alpha from GridSearchCV (Lasso): 1.0


In [14]:
# Collect the metrics of the linear-family models and print a side-by-side recap
results = [
    lr_results,
    poly_results,
    ridge_results,
    lasso_results,
]
for result in results:
    print(f"\n{result['model']}\nMAE: {result['mae']:.4f}\n RMSE: {result['rmse']:.4f}\n R²: {result['r2']:.4f}")


LinearRegression
MAE: 23.3569
 RMSE: 28.1714
 R²: -0.0116

Poly(d=2)+LinearRegression
MAE: 23.9043
 RMSE: 28.9431
 R²: -0.0678

Ridge(alpha=100.0)
MAE: 23.3404
 RMSE: 28.1560
 R²: -0.0105

Lasso(alpha=1.0)
MAE: 23.1776
 RMSE: 28.0235
 R²: -0.0010


### 2. RandomForest

In [15]:
# Random Forest Regressor: tune a small grid with 3-fold cross-validated R²
print("\nTraining Random Forest Regressor with GridSearchCV...")
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_leaf': [2, 4]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=3, n_jobs=-1, scoring='r2', verbose=1)
grid_search_rf.fit(X_train, y_train)

print(f"\nBest RF params: {grid_search_rf.best_params_}")
# GridSearchCV.predict delegates to the estimator refitted with the best params
y_pred_rf = grid_search_rf.predict(X_test)
rf_results = evaluate_regression(y_test, y_pred_rf, model_name="Random Forest Regressor")

# Keep this cell idempotent: drop any entry left by a previous run of the cell
# before adding the fresh metrics, so re-running never duplicates the model.
results = [r for r in results if r["model"] != rf_results["model"]] + [rf_results]



Training Random Forest Regressor with GridSearchCV...
Fitting 3 folds for each of 8 candidates, totalling 24 fits



Best RF params: {'max_depth': 10, 'min_samples_leaf': 4, 'n_estimators': 200}
Random Forest Regressor
MAE: 23.3765
RMSE: 28.2161
R²: -0.0148


### 3. XGBoost

In [16]:
# XGBoost Regressor: tune a small grid with 3-fold cross-validated R²
print("\nTraining XGBoost Regressor with GridSearchCV...")
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1]
}
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, n_jobs=-1, scoring='r2', verbose=1)
grid_search_xgb.fit(X_train, y_train)

print(f"\nBest XGBoost params: {grid_search_xgb.best_params_}")
# GridSearchCV.predict delegates to the estimator refitted with the best params
y_pred_xgb = grid_search_xgb.predict(X_test)
xgb_results = evaluate_regression(y_test, y_pred_xgb, model_name="XGBoost Regressor")

# Keep this cell idempotent: drop any entry left by a previous run of the cell
# before adding the fresh metrics, so re-running never duplicates the model.
results = [r for r in results if r["model"] != xgb_results["model"]] + [xgb_results]



Training XGBoost Regressor with GridSearchCV...
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best XGBoost params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
XGBoost Regressor
MAE: 23.3705
RMSE: 28.1568
R²: -0.0106
