This analysis segment will focus on stacking the optimized models. Stacking is an ensemble method that combines the predictive power of multiple models to generate a new model. This can often lead to improved model results.

## Read in data

In [None]:
import pandas as pd

# Load the pre-split training and validation sets (features and target)
# from CSV files in the current working directory.
x_train, y_train, x_val, y_val = map(
    pd.read_csv,
    ('X_train.csv', 'y_train.csv', 'X_val.csv', 'y_val.csv'),
)

In [None]:
pip install pygam

Collecting pygam
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting scipy<1.12,>=1.11.1 (from pygam)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading pygam-0.9.1-py3-none-any.whl (522 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m522.0/522.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, pygam
  Attempting uninstall: scipy
    Found existing installation: scipy 1.13.1
    Uninstalling scipy-1.13.1:
      Successfully uninstalled scipy-1.13.1
[31mERROR: pip's dependency resolver does not currently take int

The optimized models will now be stacked, i.e. combined into a single ensemble in an attempt to improve predictive performance.

In [None]:
# --- Imports (grouped once; every name the original cell imported is kept) ---
from math import sqrt

import statsmodels.api as sm
from pygam import LinearGAM, s
from sklearn import linear_model, svm
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

# --- Candidate models, each created with its previously tuned hyperparameters ---
# NOTE(review): no random_state is passed to the decision tree, random forest
# or MLP, so their fits (and the stacked metrics) can differ between runs.

# Linear Regression
linear_regression_model = LinearRegression()

# Ridge
ridge_model = Ridge(alpha=0.99)

# Lasso
lasso_model = linear_model.Lasso(alpha=0.05)

# Zero-inflated Poisson (ZIP)
# ZIP requires both endog and exog to be specified when it is built, so no
# model object is created here; the name is only a placeholder.
# It must be a plain None: a trailing comma after None would bind the
# one-element tuple (None,) instead.
zip_model = None  # inflation='probit'

# Decision tree
dt_model = DecisionTreeRegressor(
    max_depth=30,
    max_features='log2',
    min_samples_leaf=2,
    min_samples_split=2,
)

# Random forest
rf_model = RandomForestRegressor(
    max_depth=9,
    max_features='log2',
    max_leaf_nodes=6,
    n_estimators=50,
)

# SVM - SVR
SVM = svm.SVR(C=20, gamma=0.03, kernel='poly')

# Neural network
mlp = MLPRegressor(activation='relu', alpha=0.001, solver='adam')

# Generalized linear model
# GLM takes endog and exog at construction time; the model is only
# initialised here, it is not fitted.
glm_model = sm.GLM(endog=y_train, exog=x_train, family=sm.families.Gaussian())

# Generalized additive model
gam = LinearGAM(n_splines=30, spline_order=2)

ZIP and GLM require model-specific data inputs and will therefore be left out of the stacked model. The GAM is likewise not included in the list of base models below.

In [None]:
# Base learners for the stack, keyed by display name (dict insertion order is
# preserved, so both derived lists stay aligned). zip_model, glm_model and gam
# are not part of the stack.
_base_learners = {
    'Linear Regression': linear_regression_model,
    'Ridge': ridge_model,
    'Lasso': lasso_model,
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
    'SVM': SVM,
    'MLP': mlp,
}
models = list(_base_learners.values())
model_names = list(_base_learners)

In [None]:
# Display the list of base regressors so their configured hyperparameters can
# be checked before they are handed to the stacking regressor.
models

[LinearRegression(),
 Ridge(alpha=0.99),
 Lasso(alpha=0.05),
 DecisionTreeRegressor(max_depth=30, max_features='log2', min_samples_leaf=2),
 RandomForestRegressor(max_depth=9, max_features='log2', max_leaf_nodes=6,
                       n_estimators=50),
 SVR(C=20, gamma=0.03, kernel='poly'),
 MLPRegressor(alpha=0.001)]

XGBoost is chosen as the meta-regressor for the stack because it is high performing, robust to overfitting, and able to capture complex relationships. These traits make it well suited to combining the outputs of such a varied set of regression models.

In [None]:
from mlxtend.regressor import StackingCVRegressor
import xgboost as xg

# Second-level learner: a small gradient-boosted model that combines the
# base regressors' predictions.
meta_learner = xg.XGBRegressor(learning_rate=0.2, max_depth=3, n_estimators=60)

# Cross-validated stacking ensemble over the base regressors in `models`.
stack = StackingCVRegressor(
    regressors=models,
    meta_regressor=meta_learner,
    cv=10,                           # 10 folds for the out-of-fold predictions
    shuffle=False,                   # keep the row order when splitting folds
    use_features_in_secondary=True,  # meta-learner also sees the original features
    store_train_meta_features=True,  # keep the training meta-features on the fitted object
)

The stacking model is trained and evaluated.

In [None]:
# Fit the stacked ensemble on the training split and score it on the
# validation split.
# y_train is read from CSV as a single-column DataFrame; flatten it to a 1-D
# array so scikit-learn's column_or_1d check does not emit a
# DataConversionWarning during fitting.
stack.fit(x_train, y_train.to_numpy().ravel())
pred = stack.predict(x_val)

score_mae = mean_absolute_error(y_val, pred)
score_mse = mean_squared_error(y_val, pred)
score_rmse = sqrt(score_mse)
# R2 is stored as a percentage (r2_score * 100), not as a fraction.
# NOTE(review): the recorded run reported about -77.7 here, i.e. an r2_score
# of about -0.78; a negative value means the stack did worse on the validation
# set than always predicting the mean - worth investigating.
score_r2 = r2_score(y_val, pred) * 100

# Report every metric in the same "Model: <name>, <metric>: <value>" format.
stack_name = type(stack).__name__
for label, value in (
    ('MAE', score_mae),
    ('MSE', score_mse),
    ('RMSE', score_rmse),
    ('R2', score_r2),
):
    print('Model: {0}, {1}: {2}'.format(stack_name, label, value))

  y = column_or_1d(y, warn=True)


Model: StackingCVRegressor, MAE: 0.18495264383343676
Model: StackingCVRegressor, MSE: 0.08505518904985397
Model: StackingCVRegressor, RMSE: 0.29164222782349947
Model: StackingCVRegressor, R2: -77.69688095117336




In [None]:
# Collect the validation metrics of the stacked model as a one-row table.
stacking_results = [
    {
        'Model': 'Stacked Model',
        'Mean Absolute Error': score_mae,
        'Mean Squared Error': score_mse,
        'RMSE': score_rmse,
        'R-squared': score_r2,
    },
]

df = pd.DataFrame(stacking_results)
df

Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,RMSE,R-squared
0,Stacked Model,0.184953,0.085055,0.291642,-77.696881


In [None]:
# Save the metrics table to CSV (without the index column), then offer the
# file as a browser download when the notebook runs in Google Colab.
RESULTS_CSV = 'Stacking_results.csv'
df.to_csv(RESULTS_CSV, index=False)

try:
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to download through; the CSV has already
    # been written to the working directory, so the notebook can carry on.
    print('google.colab is not available; results saved to {0}'.format(RESULTS_CSV))
else:
    files.download(RESULTS_CSV)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>