# Read in the split data and load the YAML metadata

In [1]:
# Import the required modules

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np
import scipy as sp

import yaml
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin


In [2]:
# Import the split data
X_train_path = 'bin/X_train.csv'
y_train_path = 'bin/y_train.csv'
X_test_path = 'bin/X_test.csv'
y_test_path = 'bin/y_test.csv'

X_train = pd.read_csv(X_train_path)
y_train = pd.read_csv(y_train_path)
X_test = pd.read_csv(X_test_path)
y_test = pd.read_csv(y_test_path)

# print the shape of the data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# print the first 5 rows of the data
print(X_train.head())
print(y_train.head())
print(X_test.head())
print(y_test.head())
	


(40433, 879)
(40433, 1)
(10109, 879)
(10109, 1)
    x74r   term    ltv   cltv    dti    pti  age_o1  age_o2  age_o3  \
0 -1.898  0.560  0.919  0.905  0.249  1.233   0.000   0.000   0.000   
1  1.294 -1.710  0.068  0.054 -0.628 -0.296   0.000   0.000   0.000   
2  0.230  0.560  0.268  0.255 -0.788 -1.730   0.000   0.000   0.000   
3  1.648  0.560 -1.235 -1.247  0.249  0.659   0.000   0.000   0.000   
4  1.010  0.560 -0.734 -0.746 -0.788 -1.156   0.000   0.000   0.000   

   score_orig_r  ...  close_month_5  close_month_6  close_month_7  \
0         0.144  ...          False          False          False   
1         1.056  ...          False          False          False   
2         0.576  ...          False          False          False   
3        -0.912  ...          False          False          False   
4         0.640  ...          False          False          False   

   close_month_8  close_month_9  close_month_10  close_month_11  \
0          False          False           F

In [3]:
# Set the target variable
target_variable = 'Beta_winsorized'

# print the value of the target variable from the training data to validate it loaded correctly
print(y_train[target_variable].head())


0   1.009
1   1.774
2   1.438
3   1.248
4   0.988
Name: Beta_winsorized, dtype: float64


# Supervised Learning Models: Gradient Boosting, Linear, LASSO, and Random Forest

In [4]:
# Fit a Gradient Boosting Regression model on the training partition, and then evaluate it on the testing partition
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html

params = {'max_depth': 4, 'learning_rate': 0.01, 'random_state':0}
grad_boost_reg = HistGradientBoostingRegressor(**params)
grad_boost_reg.fit(X_train, y_train)

mae = mean_absolute_error(y_test, grad_boost_reg.predict(X_test))
mse = mean_squared_error(y_test, grad_boost_reg.predict(X_test))
rmse = np.sqrt(mse)
grad_boost_reg_dict = {"model": "Gradient Boosting Regression", 
                       "Mean Absolute Error": mae, 
                       "Root Mean Squared Error": rmse}

  y = column_or_1d(y, warn=True)


In [5]:
# Fit a Linear Model on the training partition, and evaluate it on the testing partition
# There is a lot of Multicollinearity in this model because we are putting all predictor variables into the model
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

linear_reg = linear_model.LinearRegression()
linear_reg.fit(X_train,y_train)

mae = mean_absolute_error(y_test, linear_reg.predict(X_test))
mse = mean_squared_error(y_test, linear_reg.predict(X_test))
rmse = np.sqrt(mse)
linear_reg_dict = {"model": "Linear Regression", 
                   "Mean Absolute Error": mae, 
                   "Root Mean Squared Error": rmse}

In [6]:
# Fit a LASSO regression on the training partition, and then evaluate it on the testing partition
# This will shrink most of the variable coefficients to zero for automated variable selection
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso
# Default hyperparameters: alpha=1.0, max_iter=1000, tol=0.0001

linear_reg_with_lasso = linear_model.Lasso(alpha=1.0, max_iter=1000, tol=0.0001)
linear_reg_with_lasso.fit(X_train,y_train)

mae = mean_absolute_error(y_test, linear_reg_with_lasso.predict(X_test))
mse = mean_squared_error(y_test, linear_reg_with_lasso.predict(X_test))
rmse = np.sqrt(mse)
lasso_reg_dict = {"model": "LASSO Regression", 
                   "Mean Absolute Error": mae, 
                   "Root Mean Squared Error": rmse}

In [8]:
# Use the LASSO regression to identify the most useful variable to predict the target (i.e., the variables whose coefficient didn't shrink to zero)

variable_names = list(X_train.columns) 
linear_reg_with_lasso_coef = linear_reg_with_lasso.coef_

for variable, coef in zip(variable_names, linear_reg_with_lasso_coef):
    # if the coefficient didn't shrink to zero
    if (coef != 0):
        # if it's a categorical variable
        if variable in new_categorical_variables:
            variable_substrings = variable.split("_")  # split up the new categorical variable name by underscores
            categorical_variable = '_'.join(variable_substrings[:-1])  # retrieve the original categorical variable name (e.g., 'x05a', 'perf_status_0923', etc.)
            category = variable_substrings[-1:][0]  # retrieve the category chosen for the categorical variable (e.g., '1', '10', '2013', 'A', etc.)
            if any(character.isdigit() for character in category):
                category = int(category)
            else:
                category = str(category)
            print(variable, ":", variable_labels_dict[categorical_variable], ":", categorical_variables_categories_dict[variable_formats_dict[categorical_variable]][category], ":", coef)
        # else it's a numeric variable
        else:
            print(variable, ":", variable_labels_dict[variable], ":", coef)

In [9]:
# Fit a Random Forest regression model on the training partition, and then evaluate it on the testing partition
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

params = {'n_estimators': 100, 'max_depth': 4, 'random_state':0}
random_forest_reg = RandomForestRegressor(**params)
random_forest_reg.fit(X_train, y_train)

mae = mean_absolute_error(y_test, random_forest_reg.predict(X_test))
mse = mean_squared_error(y_test, random_forest_reg.predict(X_test))
rmse = np.sqrt(mse)
random_forest_reg_dict = {"model": "Random Forest Regression", 
                          "Mean Absolute Error": mae, 
                          "Root Mean Squared Error": rmse}

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Use the Random Forest regression model to identify the top N most important feature based upon their respective feature importance values

top_N = 20
feature_importances = pd.Series(data=random_forest_reg.feature_importances_, index=X_train.columns)
feature_importances.sort_values(ascending=False, inplace=True)
top_N_features = feature_importances.head(top_N)

# Print the top N feature by importance from Random Forest regression model in descending order
for variable in top_N_features.index:
    feature_importance = top_N_features[variable]
    # if the feature importance is greater than zero
    if (feature_importance > 0):
        # if it's a categorical variable
        if variable in new_categorical_variables:
            variable_substrings = variable.split("_")  # split up the new categorical variable name by underscores
            categorical_variable = '_'.join(variable_substrings[:-1])  # retrieve the original categorical variable name (e.g., 'x05a', 'perf_status_0923', etc.)
            category = variable_substrings[-1:][0]  # retrieve the category chosen for the categorical variable (e.g., '1', '10', '2013', 'A', etc.)
            if any(character.isdigit() for character in category):
                category = int(category)
            else:
                category = str(category)
            print(variable, ":", variable_labels_dict[categorical_variable], ":", categorical_variables_categories_dict[variable_formats_dict[categorical_variable]][category], ":", feature_importance)
        # else it's a numeric variable
        else:
            print(variable, ":", variable_labels_dict[variable], ":", feature_importance)

In [None]:
# Compare the performance metrics for the various models on the holdout testing data

all_model_performance_metrics = [grad_boost_reg_dict,
                                linear_reg_dict,
                                lasso_reg_dict,
                                random_forest_reg_dict]

all_model_performance_metrics_df = pd.DataFrame(all_model_performance_metrics)
print(all_model_performance_metrics_df)

In [None]:
# Cross-Validation Setup
from sklearn.model_selection import cross_val_score, KFold

# Set up 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

print("Starting Gradient Boosting hyperparameter tuning...")
print(f"Training set size: {X_train.shape}")

# Hyperparameter tuning for Gradient Boosting
# Reduced parameter grid to avoid memory issues
gb_param_grid = {
    'max_depth': [10, 15, 20],
    'learning_rate': [.02, .03, .04],
    'max_iter': [500, 1000, 1500]
}

total_fits = len(gb_param_grid['max_depth']) * len(gb_param_grid['learning_rate']) * len(gb_param_grid['max_iter']) * 5  # 5-fold CV
print(f"Total model fits to perform: {total_fits}")

# Use verbose=2 to see progress, n_jobs=2 to avoid memory issues
gb_grid = GridSearchCV(
    HistGradientBoostingRegressor(random_state=0),
    gb_param_grid,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=2,  
    verbose=2  # Shows progress
)

print("\nFitting models...")
gb_grid.fit(X_train, y_train)
grad_boost_reg = gb_grid.best_estimator_

print(f"\nBest parameters found: {gb_grid.best_params_}")
print(f"Best CV score: {-gb_grid.best_score_:.3f}")

# Evaluate with cross-validation on best model
print("\nPerforming cross-validation on best model...")
cv_scores = cross_val_score(grad_boost_reg, X_train, y_train, cv=cv, 
                           scoring='neg_mean_absolute_error', verbose=1)
print(f"CV MAE: {-cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Test set evaluation
print("\nEvaluating on test set...")
mae = mean_absolute_error(y_test, grad_boost_reg.predict(X_test))
rmse = np.sqrt(mean_squared_error(y_test, grad_boost_reg.predict(X_test)))

print(f"Test MAE: {mae:.3f}")
print(f"Test RMSE: {rmse:.3f}")

grad_boost_cv_dict = {
    "model": "Gradient Boosting Regression (Cross-Validation)", 
    "Mean Absolute Error": mae, 
    "Root Mean Squared Error": rmse,
    "Best Parameters": gb_grid.best_params_
}
print("\nGradient Boosting tuning complete!")

In [None]:
# Compare the performance metrics for the various models on the holdout testing data

all_model_performance_metrics = [grad_boost_cv_dict,
                                grad_boost_reg_dict,    
                                linear_reg_dict,
                                lasso_reg_dict,
                                random_forest_reg_dict,]

all_model_performance_metrics_df = pd.DataFrame(all_model_performance_metrics)
print(all_model_performance_metrics_df)

In [None]:
print(grad_boost_cv_dict)