# Stacked Models

## Prerequisites 

In [1]:
# Helper packages
import math

import pandas as pd

# Modeling packages
import xgboost as xgb
from category_encoders.ordinal import OrdinalEncoder
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Ames housing data
ames = pd.read_csv("../data/ames.csv")

# create a reproducible 70/30 train/test split
train, test = train_test_split(ames, train_size=0.7, random_state=123)

# separate features from the label; every feature column (numeric and
# categorical) is kept here -- no numeric-only filtering is applied
X_train = train.drop(columns="Sale_Price")

# keep the target 1-D: scikit-learn regressors expect y of shape (n_samples,)
# and warn (DataConversionWarning) when handed a single-column DataFrame
y_train = train["Sale_Price"]

## Implementing stacking

In [None]:
# Ordinal encode our quality-based features: every column whose name ends in
# "Qual", "QC" or "Cond" is mapped onto a single worst-to-best integer scale
ord_cols = list(X_train.filter(regex=r"Qual$|QC$|Cond$").columns)
lvs = ["Very_Poor", "Poor", "Fair", "Below_Average", "Average", "Typical", 
       "Above_Average", "Good", "Very_Good", "Excellent", "Very_Excellent"]
# level name -> rank, following the order of `lvs` (0 is the first entry)
lvl_map = {level: rank for rank, level in enumerate(lvs)}
category_mapping = [{'col': col, 'mapping': lvl_map} for col in ord_cols]
ord_encoder = OrdinalEncoder(cols=ord_cols, mapping=category_mapping)

# one hot encode the *remaining* nominal features. The ordinal columns are
# excluded explicitly: selecting on dtype alone would also pick them up, so
# they would be encoded twice (once ordinal, once one-hot).
nominal_cols = [
  col for col in X_train.select_dtypes(include="object").columns
  if col not in ord_cols
]

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword was later removed; fall back to the old name on older releases
try:
  encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
  encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# combine into a pre-processing pipeline; columns not named in a transformer
# are passed through unchanged
preprocessor = ColumnTransformer(
  remainder="passthrough",
  transformers=[
   ("ord_encode", ord_encoder, ord_cols),
   ("one-hot", encoder, nominal_cols),
   ]
  )

In [None]:
# Base learners for the stack. The stochastic learners are seeded so that
# repeated runs of the notebook give the same results, in line with the
# seeded train/test split.

# linear model
lm_mod = linear_model.LinearRegression()

# decision tree model
dt_mod = DecisionTreeRegressor(
  ccp_alpha=0.1, 
  max_depth=15, 
  min_samples_split=40,
  random_state=123
)

# random forest model
# NOTE: `max_samples` only has an effect when bootstrapping is enabled, and
# recent scikit-learn releases raise a ValueError at fit time when it is
# combined with bootstrap=False. bootstrap=True keeps the 65% per-tree
# sample size that was requested.
rf_mod = RandomForestRegressor(
  n_estimators=1000,
  max_features=0.21,
  max_samples=0.65,
  min_samples_leaf=1,
  bootstrap=True,
  random_state=123
)

# XGBoost GBM model
xgb_mod = xgb.XGBRegressor(
  n_estimators=5000,
  learning_rate=0.1,
  max_depth=3,
  min_child_weight=1,
  subsample=1,
  colsample_bytree=0.75,
  colsample_bylevel=0.75,
  colsample_bynode=0.75,
  random_state=123
)

In [None]:
def _with_preprocessing(step_name, model):
  """Return a two-step Pipeline: the shared preprocessor, then `model`.

  `step_name` is the name given to the model step inside the pipeline.
  """
  return Pipeline(steps=[("preprocessor", preprocessor), (step_name, model)])


# one pipeline per base learner, each starting with the same pre-processing
lm_pipeline = _with_preprocessing("lm_mod", lm_mod)    # linear model
dt_pipeline = _with_preprocessing("dt_mod", dt_mod)    # decision tree
rf_pipeline = _with_preprocessing("rf_mod", rf_mod)    # random forest
xgb_pipeline = _with_preprocessing("xgb_mod", xgb_mod)  # XGBoost

In [None]:
# (name, pipeline) pairs for the base learners that feed the stack
_learner_names = ('Linear regression', 'Decision tree', 'Random forest', 'XGBoost')
_learner_pipelines = (lm_pipeline, dt_pipeline, rf_pipeline, xgb_pipeline)
estimators = list(zip(_learner_names, _learner_pipelines))

# the meta-learner is a ridge regression with built-in cross-validation,
# fitted on the base learners' predictions
meta_learner = RidgeCV()
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=meta_learner)

In [None]:
# create 5 fold CV object (shuffled, seeded for reproducibility)
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# perform 5-fold cross validation of the stacked model.
# error_score="raise" makes a failing fit stop the run with its real error
# instead of silently recording NaN for that fold.
results = cross_val_score(
  stacking_regressor, 
  X_train, 
  y_train, 
  cv=kfold, 
  scoring='neg_root_mean_squared_error',
  error_score="raise"
)

# get average CV RMSE (the scorer returns negated RMSE, hence abs)
abs(results.mean())

## Exercises

Using the Boston housing data set, where the response feature is the median value of homes within a census tract (`cmedv`):

1. Recreate the optimal models identified from the exercises in the [linear regression](https://misk-data-science.github.io/misk-homl/docs/notebooks/04-linear-regression.html#Exercises),  [decision tree](https://misk-data-science.github.io/misk-homl/docs/notebooks/09-decision-trees.html#Exercises), [random forest](https://misk-data-science.github.io/misk-homl/docs/notebooks/11-random-forests.html#Exercises), and [gradient boosting](https://misk-data-science.github.io/misk-homl/docs/notebooks/12-gbm.html#Exercises) modules.
2. Apply a stacked model and compare the model performance to the individual models.
3. Now repeat 1 & 2 for the Attrition dataset, which is a classification problem rather than a regression problem.