In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

from math import sqrt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from IPython.display import display, FileLink
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats import skew
from scipy.special import boxcox1p
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# House prices predictions: stacking models

Another attempt to improve upon my Housing Prices Prediction score using stacked models.

See [linear-regression](./linear-regression.ipynb) for feature engineering.

In [2]:
# Directory holding the feather inputs read below and the submission CSV written at the end.
PATH = Path('./data')

In [3]:
# Feature frames for the training and test houses.
df_train = pd.read_feather(PATH / 'df_numeric')
df_test = pd.read_feather(PATH / 'df_test')

# The target and the test ids are stored as frames; keep only the column we need from each.
sale_price_log = pd.read_feather(PATH / 'sale_price_log')['SalePrice']
test_house_ids = pd.read_feather(PATH / 'test_house_ids')['Id']

In [4]:
# Hold out 20% of the training rows as a validation set (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    sale_price_log,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [5]:
# Shared 10-fold splitter (shuffled, fixed seed) passed as `cv` to every cross_val_score call below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

### Lasso

In [8]:
# Lasso baseline; `scores` holds the per-fold RMSE from the shared `kf` splitter.
lasso_model = Lasso(alpha=0.0004)
scores = np.sqrt(np.negative(cross_val_score(
    estimator=lasso_model, X=df_train, y=sale_price_log,
    scoring='neg_mean_squared_error', cv=kf)))

In [7]:
# Mean cross-validated RMSE for the Lasso model.
np.mean(scores)

0.10823729456698991

In [8]:
# Refit on the training split only, then report RMSE on the held-out validation split.
lasso_val_preds = lasso_model.fit(X_train, y_train).predict(X_val)
print(f"RMSE on val set: {sqrt(((lasso_val_preds - y_val)**2).mean())}")

RMSE on val set: 0.11070044379233307


### Ridge

In [9]:
# Ridge baseline; `scores` holds the per-fold RMSE from the shared `kf` splitter.
ridge_model = Ridge(alpha=30, fit_intercept=True)
scores = np.sqrt(np.negative(cross_val_score(
    estimator=ridge_model, X=df_train, y=sale_price_log,
    scoring='neg_mean_squared_error', cv=kf)))

In [10]:
# Mean cross-validated RMSE for the Ridge model.
np.mean(scores)

0.10958086474232627

In [11]:
# Refit on the training split only, then report RMSE on the held-out validation split.
ridge_val_preds = ridge_model.fit(X_train, y_train).predict(X_val)
print(f"RMSE on val set: {sqrt(((ridge_val_preds - y_val)**2).mean())}")

RMSE on val set: 0.1160855649761875


### ElasticNet

In [10]:
# ElasticNet baseline (mostly L1); `scores` holds the per-fold RMSE from the shared `kf` splitter.
elasticnet_model = ElasticNet(l1_ratio=0.8, alpha=0.0006)
scores = np.sqrt(np.negative(cross_val_score(
    estimator=elasticnet_model, X=df_train, y=sale_price_log,
    scoring='neg_mean_squared_error', cv=kf)))

In [13]:
# Mean cross-validated RMSE for the ElasticNet model.
np.mean(scores)

0.10810810326324131

In [14]:
# Refit on the training split only, then report RMSE on the held-out validation split.
elasticnet_val_preds = elasticnet_model.fit(X_train, y_train).predict(X_val)
print(f"RMSE on val set: {sqrt(((elasticnet_val_preds - y_val)**2).mean())}")

RMSE on val set: 0.11033188290550583


### GradientBoostingRegressor

In [11]:
# Gradient boosting with Huber loss; `scores` holds the per-fold RMSE from the shared `kf` splitter.
gbr_model = GradientBoostingRegressor(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.02,
    max_depth=6,
    max_features='sqrt',
    min_samples_split=50,
    min_samples_leaf=15,
    random_state=5,
)
scores = np.sqrt(np.negative(cross_val_score(
    estimator=gbr_model, X=df_train, y=sale_price_log,
    scoring='neg_mean_squared_error', cv=kf)))

In [16]:
# Mean cross-validated RMSE for the gradient boosting model.
np.mean(scores)

0.1135230901245946

In [17]:
# Refit on the training split only, then report RMSE on the held-out validation split.
gbr_val_preds = gbr_model.fit(X_train, y_train).predict(X_val)
print(f"RMSE on val set: {sqrt(((gbr_val_preds - y_val)**2).mean())}")

RMSE on val set: 0.12258554724859602


### LightGBM

In [12]:
# LightGBM with small trees; `scores` holds the per-fold RMSE from the shared `kf` splitter.
# Keyword order is kept as in the original call.
lgbm_model = LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
scores = np.sqrt(np.negative(cross_val_score(
    estimator=lgbm_model, X=df_train, y=sale_price_log,
    scoring='neg_mean_squared_error', cv=kf)))

In [19]:
# Mean cross-validated RMSE for the LightGBM model.
np.mean(scores)

0.11593711158827336

In [20]:
# Refit on the training split only, then report RMSE on the held-out validation split.
lgbm_val_preds = lgbm_model.fit(X_train, y_train).predict(X_val)
print(f"RMSE on val set: {sqrt(((lgbm_val_preds - y_val)**2).mean())}")

RMSE on val set: 0.12305123423884004


### Average models

In [21]:
# Unweighted mean of the five models' validation predictions.
# `sum` starts from the LightGBM predictions so the additions happen in the original order.
average_preds = sum(
    (gbr_val_preds, elasticnet_val_preds, ridge_val_preds, lasso_val_preds),
    lgbm_val_preds,
) / 5

In [22]:
# RMSE of the averaged validation predictions against the held-out targets.
print(f"RMSE on ensemble: {sqrt(((average_preds - y_val)**2).mean())}")

RMSE on ensemble: 0.112181069990364


### Meta-models

Create a meta-model on top of the averaged base models, and use the base models' out-of-fold predictions to train it.

1. Split the total training set into 2 disjoint sets (here **train** and **holdout**).
2. Train several base models on the first part (**train**).
3. Test base models on the 2nd part (**holdout**).
4. Use the predictions from step 3 (the out-of-fold predictions) as inputs and the correct responses (target variable) as outputs to train a higher-level learner called the **meta-model**.

In [13]:
class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta-model.

    During ``fit`` every base model is cloned and trained once per fold; the
    predictions each clone makes on its held-out fold become one feature column
    (one column per base model) for the meta-model. During ``predict`` the fold
    clones of each base model are averaged to rebuild those feature columns.

    Parameters
    ----------
    base_models : sequence of estimators
        Unfitted regressors; they are cloned, never fitted in place.
    meta_model : estimator
        Regressor trained on the out-of-fold predictions (also cloned).
    n_folds : int, default=5
        Number of KFold splits used to build the out-of-fold predictions.

    Attributes
    ----------
    base_models_ : list of lists
        ``base_models_[i]`` holds the ``n_folds`` fitted clones of ``base_models[i]``.
    meta_model_ : estimator
        The fitted clone of ``meta_model``.
    """
    # NOTE(review): TransformerMixin is inherited but no transform() is defined here,
    # so fit_transform() is not usable as-is.

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    @staticmethod
    def _unwrap_pandas(data):
        """Return the underlying NumPy values for pandas objects; pass anything else through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data

    def fit(self, X, y):
        """Fit the fold clones of every base model, then the meta-model.

        ``X`` and ``y`` may be NumPy arrays or pandas objects. Pandas inputs are
        unwrapped first because the fold indices below are positional, and
        ``frame[int_array]`` would select by label instead.

        Returns
        -------
        self
        """
        X = self._unwrap_pandas(X)
        y = self._unwrap_pandas(y)

        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # Train cloned base models then create out-of-fold predictions
        # that are needed to train cloned meta-model.
        # Shape: (n_samples, n_base_models); each row is filled exactly once per model,
        # by the clone that did not see that row during training.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        # Train the cloned meta-model using out-of-fold predictions
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on the averaged base-model predictions.

        For each base model, the predictions of its fold clones are averaged into
        a single column; the resulting ``(n_samples, n_base_models)`` matrix is
        passed to the fitted meta-model. Pandas input is unwrapped so the clones
        see the same kind of array they were fitted on.
        """
        X = self._unwrap_pandas(X)
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

In [14]:
# Stack all five base models under a lightly regularised Lasso meta-model.
stacked_averaged_models = StackingAverageModels(
    base_models=(
        lgbm_model,
        gbr_model,
        elasticnet_model,
        ridge_model,
        lasso_model,
    ),
    meta_model=Lasso(alpha=0.0005, max_iter=2000, random_state=1),
)

In [15]:
# Pass plain NumPy arrays: StackingAverageModels.fit slices X and y with positional fold indices.
stacked_averaged_models.fit(X_train.values, y_train.values)

StackingAverageModels(base_models=(LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
       boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       feature_fraction=0.2319, feature_fraction_seed=9,
       learning_rate=0.05, max_bin=55, max_depth=-1, min_child_samples=20,
       min_child_...ve=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)),
           meta_model=Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False),
           n_folds=5)

In [16]:
# Predict from the raw array, matching how the stacker was fitted (`X_train.values`),
# so the fold clones receive the same input type at fit and predict time.
stacked_averaged_val_preds = stacked_averaged_models.predict(X_val.values)
print(f"RMSE on val set: {sqrt(((stacked_averaged_val_preds - y_val)**2).mean())}")

RMSE on val set: 0.11263737895611997


Stacking doesn't help on either the validation set or the leaderboard.

Finally, we'll train the base models on the whole training set and submit their averaged predictions.

In [33]:
# Refit every base model on the full training set before predicting on the test set.
for full_data_model in (lasso_model, ridge_model, elasticnet_model, gbr_model):
    full_data_model.fit(df_train, sale_price_log)
# Left as a bare trailing expression so the cell still displays the fitted LightGBM model.
lgbm_model.fit(df_train, sale_price_log)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
       boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       feature_fraction=0.2319, feature_fraction_seed=9,
       learning_rate=0.05, max_bin=55, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=6, min_split_gain=0.0,
       min_sum_hessian_in_leaf=11, n_estimators=720, n_jobs=-1,
       num_leaves=5, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

## 4. Submit predictions

In [34]:
# Predict the test set with each refitted base model, in the same order as before.
(
    lasso_test_preds,
    ridge_test_preds,
    elasticnet_test_preds,
    gbr_test_preds,
    lgbm_test_preds,
) = (
    fitted.predict(df_test)
    for fitted in (lasso_model, ridge_model, elasticnet_model, gbr_model, lgbm_model)
)

In [35]:
# Unweighted mean of the five models' test predictions.
# `sum` starts from the Lasso predictions so the additions happen in the original order.
average_preds = sum(
    (ridge_test_preds, elasticnet_test_preds, gbr_test_preds, lgbm_test_preds),
    lasso_test_preds,
) / 5

In [17]:
# Stacked-model alternative, left disabled: the submission cell uses `average_preds` instead.
# stacked_test_preds = stacked_averaged_models.predict(df_test.values)
# Best score: 0.11953
# NOTE(review): presumably a leaderboard score — confirm which model it belongs to.

In [48]:
# The models were fitted on `sale_price_log`, so exponentiate the averaged predictions
# before writing. The path uses the same `PATH / name` join as every other path here.
pd.DataFrame({
    'Id': test_house_ids,
    'SalePrice': np.exp(average_preds),
}).to_csv(PATH / 'sub_stack.csv', index=False)

In [50]:
# Show a link to the submission file written by the previous cell.
FileLink(str(PATH / 'sub_stack.csv'))

<img src="./images/submission-12.png">