# Zillow Stacking

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
import time
from datetime import datetime

from sami_function import missing_ratio
from zillow_functions import create_newFeatures, data_preprocessing, memory_reduce, create_special_feature


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
from sklearn import linear_model

%matplotlib inline
seed = 42  # shared random seed; passed as random_state / *_seed to the estimators defined further down
n_features = 100  # NOTE(review): not referenced in any visible cell — presumably a feature-selection cap; confirm or remove

In [2]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [3]:
%%time

# --- Stage 1: read the already-preprocessed training table (semicolon-delimited CSV).
print('\nLoad preprocessed data ...')
df_train = pd.read_csv('../data/train_processed.csv', sep=';')

# --- Stage 2: split the table into the feature matrix and the regression target.
print('\nCreating training set ...')
x_train = df_train.drop(labels=['parcelid', 'logerror'], axis='columns')  # drop the id and the target
y_train = df_train.loc[:, 'logerror'].values  # target as a plain NumPy array
y_mean = y_train.mean()  # mean of the target; not used by the visible cells below
print(x_train.shape, y_train.shape)

# No global feature scaling is applied here (a StandardScaler step used to be
# sketched at this point but is disabled); the linear models defined later
# wrap their own RobustScaler inside a pipeline.



Loading data ...




Shape train: (90275, 60)

Data preprocessing ...

	Outliers treated ...

Creating new features ...

Reducing consumption memory ...
	Initial size 46.84 MB
	There are 0 columns that cannot be reduced
	There are 80 columns reduced
	Final size 17.57 MB

Creating training set ...
(90275, 78) (90275,)

Features selection ...
Wall time: 43 s


In [4]:
from zillow_stacking import AveragingModels, StackingAveragedModels, mae_cv

In [16]:
%%time

# Base learners used by the averaging / stacking experiments below.
# Every estimator that accepts one is seeded with the notebook-wide `seed`.

# Linear models: each is wrapped in a pipeline so that scaling is fitted
# inside every cross-validation fold rather than on the full training set.
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.1, random_state=seed))

ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.1, l1_ratio=.9, random_state=seed))

# Gradient boosting with a Huber loss.
# The split criterion is 'friedman_mse' (scikit-learn's default) instead of
# the former 'mae': criterion='mae' was deprecated in scikit-learn 0.24 and
# later removed, and where it is still accepted it is very slow on a training
# set of this size. Robustness to outliers is provided by loss='huber', not by
# the split criterion.
GBoost = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, criterion='friedman_mse',
                                   max_depth=4, max_features='sqrt', verbose=2,
                                   min_samples_leaf=15, min_samples_split=10,
                                   subsample=0.8, loss='huber', random_state=seed)

# XGBoost regressor.
# NOTE(review): base_score=0.010406 is presumably the mean training logerror — confirm.
# NOTE(review): 'reg:linear', `silent` and `nthread` are legacy XGBoost spellings;
# recent releases use 'reg:squarederror', `verbosity` and `n_jobs`. Left unchanged
# because the installed XGBoost version is not visible from this notebook.
model_xgb = xgb.XGBRegressor(learning_rate=0.031, max_depth=8,
                             min_child_weight=1, n_estimators=1000,
                             objective='reg:linear', eval_metric='mae', base_score=0.010406,
                             gamma=0, subsample=0.8, silent=1,
                             random_state=seed, nthread=-1)

# LightGBM regressor: small learning rate with 720 trees; row and feature
# subsampling are both seeded for reproducibility.
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=32, metric='mae',
                              learning_rate=0.0025, n_estimators=720, max_depth=100,
                              max_bin=55, bagging_fraction=0.95,
                              bagging_freq=8, feature_fraction=0.85,
                              feature_fraction_seed=seed, bagging_seed=seed,
                              min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

Wall time: 502 µs


In [None]:
%%time

# Cross-validate the gradient boosting model on its own.
# NOTE(review): mae_cv comes from zillow_stacking; presumably the last argument
# is the fold count and the result holds one MAE per fold — confirm.
score = mae_cv(x_train, y_train, GBoost, 5)
report = "Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())
print(report)

In [6]:
%%time

# Cross-validate both linear pipelines with the same settings and print the
# mean and standard deviation of the returned scores for each.
for template, estimator in (
    ("\nLasso score: {:.4f} ({:.4f})\n", lasso),
    ("ElasticNet score: {:.4f} ({:.4f})\n", ENet),
):
    score = mae_cv(x_train, y_train, estimator, 5)
    print(template.format(score.mean(), score.std()))


Lasso score: 0.0604 (0.0020)

ElasticNet score: 0.0604 (0.0020)

Wall time: 9.64 s


In [None]:
%%time
# Cross-validate the XGBoost and LightGBM regressors with the same settings
# and print the mean and standard deviation of the returned scores for each.
for template, estimator in (
    ("Xgboost score: {:.4f} ({:.4f})\n", model_xgb),
    ("LGBM score: {:.4f} ({:.4f})\n", model_lgb),
):
    score = mae_cv(x_train, y_train, estimator, 5)
    print(template.format(score.mean(), score.std()))

In [7]:
# Combine three base learners with the project's AveragingModels wrapper
# and cross-validate the ensemble as a single estimator.
base_estimators = (ENet, GBoost, lasso)
averaged_models = AveragingModels(models=base_estimators)

score = mae_cv(x_train, y_train, averaged_models, 5)
summary = " Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())
print(summary)

 Averaged base models score: 0.0604 (0.0020)



In [None]:
# Stacked ensemble: base learners feed a Lasso meta-model through the
# project's StackingAveragedModels wrapper.
#
# The third base learner is model_lgb. This cell previously referenced `KRR`,
# which is not defined in any visible cell (KernelRidge is imported but never
# instantiated), so the cell raised NameError. A kernel ridge model is also a
# poor fit here: it needs an n_samples x n_samples kernel matrix, which is
# impractical for a training set of roughly 90k rows.
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, model_lgb),
                                                 meta_model = lasso)

score = mae_cv(x_train, y_train, stacked_averaged_models, 5)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))