# Ames Housing Data

#### Imports

In [1]:
import os
import math
import warnings

In [2]:
# Default Libraries
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import seaborn           as sns

# Model Selection
import lightgbm as lgb

In [3]:
# Preprocessing
from sklearn.preprocessing   import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics         import mean_squared_error
from scipy.special           import boxcox1p

# Model Selection
from sklearn.model_selection import GridSearchCV, KFold
from xgboost                 import XGBRegressor, plot_importance
from sklearn.ensemble        import GradientBoostingRegressor
from sklearn.base            import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.linear_model    import Lasso, Ridge, ElasticNet
from sklearn.pipeline        import make_pipeline
from sklearn.kernel_ridge    import KernelRidge
from mlxtend.regressor       import StackingCVRegressor

In [4]:
%matplotlib inline

In [5]:
pd.set_option( 'display.max_columns', None )
sns.set( rc = { 'figure.figsize' : ( 10, 5 ) } )
warnings.filterwarnings( 'ignore' )

#### Global Functions

In [6]:
def get_numerical_features( df ):
    return df.select_dtypes( include = [ 'int64', 'float64' ] ).columns

def get_categorical_features( df ):
    return df.select_dtypes( include = [ 'object' ] ).columns

def return_features_with_null( df ):
    still_missing = pd.DataFrame( len( df[ get_categorical_features ] ) - df[ get_categorical_features ].count() )
    return pd.DataFrame( still_missing[ still_missing[ 0 ] > 0 ] )

def return_rows_with_null( df ):
    null_columns = df.columns[ df.isnull().any() ]
    print( pd.DataFrame( df[ df.isnull().any( axis = 1 ) ][ null_columns ].head( 10 ) ) )

def na_heatmap( df ):
    df      = df[ sorted( df.columns ) ]
    fig, ax = plt.subplots( figsize = ( 25, 5 ) )
    sns.heatmap( df.isnull(), yticklabels = False, cbar = False )

#### Load Data

In [7]:
data_path = os.getcwd() + '\\..\\..\\..\\data\\ames_housing\\'
train_raw = pd.read_csv( data_path + 'train.csv' )
test_raw  = pd.read_csv( data_path + 'test.csv' )

#### Explore the Data

###### Data Overview

In [8]:
print( "Train: {} \nTest: {}".format( train_raw.shape, test_raw.shape ) )

Train: (1460, 81) 
Test: (1459, 80)


In [9]:
# train_X.head( 3 )

In [10]:
# train_X.describe()

In [11]:
# facet_grid = pd.melt( df, value_vars = sorted( get_numerical_features( df ) ) )
# grid_plot  = sns.FacetGrid( facet_grid, col = 'variable', col_wrap = 10, sharex = False, sharey = False)
# grid_plot.map( sns.distplot, 'value' )

In [12]:
# facet_grid = pd.melt( df, value_vars = sorted( get_categorical_features( df ) ) )
# grid_plot  = sns.FacetGrid( facet_grid, col = 'variable', col_wrap = 10, sharex = False, sharey = False )
# grid_plot  = grid_plot.map( sns.countplot, 'value' )

# plt.xticks( rotation = 'vertical' )
# [ plt.setp( ax.get_xticklabels(), rotation = 60 ) for ax in grid_plot.axes.flat ]
# grid_plot.fig.tight_layout()

In [13]:
# plt.subplots( figsize = ( 15, 15 ) )
# sns.heatmap( train_X.corr(), vmax = 1, square = True, cmap = 'magma', linecolor = 'white', linewidth = 0.1 )

###### Correct Outliers

In [14]:
# sns.scatterplot( x = 'GrLivArea', y = 'SalePrice', data = train_raw )

In [15]:
train_raw = train_raw.drop( train_raw[ ( train_raw[ 'GrLivArea' ] > 4000 ) & ( train_raw[ 'SalePrice' ] < 250000 ) ].index )

In [16]:
train_X = train_raw.drop( [ 'SalePrice', 'Id' ], axis = 1 )
train_y = train_raw[ 'SalePrice' ]
test_X  = test_raw.drop( [ 'Id' ], axis = 1 )

###### Transform Response

In [17]:
# sns.distplot( train_y, bins = 75 )

In [18]:
print( 'Skew: {} \nKurtosis: {}'.format( round( train_y.skew(), 4 ), 
                                         round( train_y.kurtosis(), 4 ) ) )

Skew: 1.8813 
Kurtosis: 6.5231


In [19]:
train_y = np.log1p( train_y )
# sns.distplot( train_y, bins = 75 )

In [20]:
print( 'Skew: {} \nKurtosis: {}'.format( round( train_y.skew(), 4 ), 
                                         round( train_y.kurtosis(), 4 ) ) )

Skew: 0.1216 
Kurtosis: 0.8048


In [21]:
full_X    = pd.concat( [train_X, test_X] )
train_end = len( train_X )
test_end  = len( full_X )

print( full_X.shape )

(2917, 79)


In [22]:
# na_heatmap( full_X )

In [23]:
categorical_features = list( get_categorical_features( full_X ) )
numerical_features   = list( get_numerical_features( full_X ) )

#### Data Preprocessing

###### Replace Missing Values

In [24]:
# na_heatmap( full_X )

In [25]:
fill_with_none = [ 'Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Fence', 'FireplaceQu', 
                   'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'MasVnrType', 'MiscFeature', 'MSSubClass', 'PoolQC' ]

fill_with_zero = [ 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'TotalBsmtSF', 'GarageYrBlt', 'MasVnrArea' ]

full_X[ fill_with_none ] = full_X[ fill_with_none ].fillna( 'None' )
full_X[ fill_with_zero ] = full_X[ fill_with_zero ].fillna( 0 )

###### Drop Useless Feature

In [26]:
len( full_X[ full_X[ 'Utilities' ] == 'AllPub' ] ) / len( full_X )

0.9989715461090161

In [27]:
full_X = full_X.drop( [ 'Utilities' ], axis = 1 )

###### Impute Remaining NaN

In [28]:
# Kaggle says NA is Typ, which after ranking is value 8
full_X[ 'Functional' ] = full_X["Functional"].fillna( 'Typ' )

In [29]:
missing_with_mode           = [ 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'MSZoning' ]
full_X[ missing_with_mode ] = full_X[ missing_with_mode ].fillna( full_X.mode().iloc[0] )

In [30]:
full_X[ 'LotFrontage' ] = full_X.groupby( 'Neighborhood' )[ 'LotFrontage' ].transform( lambda x: x.fillna( x.median() ) )

In [31]:
# return_features_with_null( full_X )

In [32]:
# return_rows_with_null( full_X )

In [33]:
full_X[ 'MSSubClass' ]  = full_X['MSSubClass'].apply(str)
full_X[ 'OverallCond' ] = full_X['OverallCond'].astype(str)
full_X[ 'YrSold' ]      = full_X['YrSold'].astype(str)
full_X[ 'MoSold' ]      = full_X['MoSold'].astype(str)

###### Create Ranked Features

In [34]:
full_X[ 'Alley'        ].replace( { 'None' : 0, 'Grvl' : 1, 'Pave' : 2 }, inplace = True )
full_X[ 'BsmtCond'     ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4 }, inplace = True )
full_X[ 'BsmtExposure' ].replace( { 'None' : 0, 'No' : 1, 'Mn' : 2, 'Av' : 3, 'Gd' : 4 }, inplace = True )
full_X[ 'BsmtFinType1' ].replace( { 'None' : 0, 'Unf' : 1, 'LwQ' : 2, 'Rec' : 3, 'BLQ' : 4, 'ALQ' : 5, 'GLQ' : 6 }, inplace = True )
full_X[ 'BsmtFinType2' ].replace( { 'None' : 0, 'Unf' : 1, 'LwQ' : 2, 'Rec' : 3, 'BLQ' : 4, 'ALQ' : 5, 'GLQ' : 6 }, inplace = True )
full_X[ 'BsmtQual'     ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'ExterCond'    ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'ExterQual'    ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'Fence'        ].replace( { 'None' : 0, 'MnWw' : 1, 'GdWo' : 2, 'MnPrv' : 3, 'GdPrv' : 4 }, inplace = True )
full_X[ 'FireplaceQu'  ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'Functional'   ].replace( { 'None' : 0, 'Sal' : 1, 'Sev' : 2, 'Maj2' : 3, 'Maj1' : 4, 'Mod' : 5, 'Min2' : 6, 'Min1' : 7, 'Typ' : 8 }, inplace = True )
full_X[ 'GarageCond'   ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'GarageFinish' ].replace( { 'None' : 0, 'Unf' : 1, 'RFn' : 2, 'Fin' : 3 }, inplace = True )
full_X[ 'GarageQual'   ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'HeatingQC'    ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'KitchenQual'  ].replace( { 'None' : 0, 'Po' : 1, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'LandSlope'    ].replace( { 'None' : 0, 'Sev' : 1, 'Mod' : 2, 'Gtl' : 3 }, inplace = True )
full_X[ 'LandContour'  ].replace( { 'None' : 0, 'Low' : 1, 'HLS' : 2, 'Bnk' : 3, 'Lvl' : 4 }, inplace = True )
full_X[ 'LotShape'     ].replace( { 'None' : 0, 'Reg' : 1, 'IR1' : 2, 'IR2' : 3, 'IR3' : 4 }, inplace = True )
full_X[ 'PoolQC'       ].replace( { 'None' : 0, 'Fa' : 2, 'TA' : 3, 'Gd' : 4, 'Ex' : 5 }, inplace = True )
full_X[ 'PavedDrive'   ].replace( { 'None' : 0, 'N' : 1, 'P' : 2, 'Y' : 3 }, inplace = True )

###### Add Feature

In [35]:
full_X[ 'TotalLivAreaSF' ] = full_X[ '1stFlrSF'     ] + full_X[ '2ndFlrSF' ]
full_X[ 'TotalOtherSF' ]   = full_X[ 'TotalBsmtSF'  ] + full_X[ 'GrLivArea' ] 
full_X[ 'TotalBath'   ]    = full_X[ 'BsmtFullBath' ] + ( 0.5 * full_X[ 'BsmtHalfBath' ] ) + full_X[ 'FullBath' ] + ( 0.5 * full_X[ 'HalfBath' ] )
full_X[ 'TotalPorchSF' ]   = full_X[ 'OpenPorchSF'  ] + full_X[ 'EnclosedPorch' ] + full_X[ '3SsnPorch' ] + full_X[ 'ScreenPorch' ]

full_X['HasBasement']  =  full_X[ 'TotalBsmtSF' ].apply(lambda x: 1 if x > 0 else 0)
full_X['HasGarage']    =  full_X[ 'GarageArea' ].apply(lambda x: 1 if x > 0 else 0)
full_X['HasPorch']     =  full_X[ 'TotalPorchSF' ].apply(lambda x: 1 if x > 0 else 0)
full_X['HasPool']      =  full_X[ 'PoolArea' ].apply(lambda x: 1 if x > 0 else 0)
full_X['WasRemodeled'] = (full_X[ 'YearRemodAdd' ] != full_X[ 'YearBuilt' ] ).astype(np.int64)
full_X['IsNew']        = (full_X[ 'YearBuilt' ] > 2000).astype(np.int64)
full_X['WasCompleted'] = (full_X[ 'SaleCondition' ] != 'Partial').astype(np.int64)

###### BoxCox Transformation

In [36]:
numeric_feats = get_numerical_features( full_X )
skew_features = {}

for feature in numeric_feats:
    skew_features[ feature ] = full_X[ feature ].skew()
    
skew_features = pd.DataFrame( { 'Features' : list( skew_features.keys() ), 
                                'Skew'     : list( skew_features.values() ) } )

features_to_box = list( skew_features[ abs( skew_features[ 'Skew' ] ) > 0.75 ][ 'Features' ] )

In [37]:
for feature in features_to_box:
    full_X[ [feature] ] = boxcox1p( full_X[ [feature] ], 0.15 )

###### Log Transformation

In [38]:
features_to_log = pd.DataFrame( { 'Kurtosis' : round( full_X.kurtosis(), 4 ), 'Skew' : full_X.skew() } )
features_to_log = list( np.setdiff1d( list( features_to_log[ features_to_log[ 'Kurtosis' ] > 1.15 ].index ), categorical_features ) )
features_to_log.remove( 'MSSubClass' )
features_to_log.remove( 'OverallCond' )

In [39]:
for feature in features_to_log:
    full_X[ [feature] ] = np.log1p( full_X[ [feature] ] )

###### Create One-Hot-Encoded

In [40]:
categorical_features = list( get_categorical_features( full_X ) )

In [41]:
full_X = pd.get_dummies( full_X, 
                         drop_first = True, 
                         prefix     = categorical_features, 
                         columns    = categorical_features )

###### Split Back to Train/Test

In [42]:
train_X = pd.DataFrame( full_X[ 0:train_end ] )
test_X  = pd.DataFrame( full_X[ train_end:test_end ] )

print( "Train: {} \nTest: {}".format( train_X.shape, test_X.shape ) )

Train: (1458, 242) 
Test: (1459, 242)


In [43]:
scaler = RobustScaler()

train_X = scaler.fit_transform( train_X )
test_X  = scaler.fit( test_X )

#### Build & Compare Models

###### XGBRegressor

In [44]:
# xgbm_grid = { 
#     'learning_rate'    : [ 0.1 ],
#     'max_depth'        : [ 4 ],
#     'subsample'        : [ 0.75 ],
#     'colsample_bytree' : [ 1 ],
#     'n_estimators'     : [ 100 ],
#     'reg_alpha'        : [ 0 ],
#     'reg_lambda'       : [ 0.25 ] 
# }

# xgbm = GridSearchCV( XGBRegressor(), cv = 5, param_grid = xgbm_grid, n_jobs = -1, scoring = 'neg_mean_squared_error', verbose = 1 )
# xgbm.fit( train_X, train_y )

###### GradientBoostingRegressor

In [45]:
# gbr_grid = {
#     'n_estimators'      : [ 3500 ],
#     'learning_rate'     : [ 0.025 ],
#     'max_depth'         : [ 3 ],
#     'max_features'      : [ 'log2' ],
#     'min_samples_leaf'  : [ 10 ],
#     'min_samples_split' : [ 2 ],
#     'loss'              : [ 'huber' ]
# }

# gbr = GridSearchCV( GradientBoostingRegressor(), cv = 5, param_grid = gbr_grid, n_jobs = -1, scoring = 'neg_mean_squared_error', verbose = 1 )
# gbr.fit( train_X, train_y )

###### Lasso

In [46]:
# lasso_grid = { 
#     'alpha'    : [ 0.00058 ], 
#     'max_iter' : [ 300 ] 
# }

# lasso = GridSearchCV( Lasso(), cv = 5, param_grid = lasso_grid, n_jobs = -1, scoring = 'neg_mean_squared_error', verbose = 1 )
# lasso.fit( train_X, train_y )

###### Kernel Ridge

In [47]:
# kernel_grid = {
#     'alpha'  : [ 1 ],
#     'kernel' : [ 'linear' ],
#     'degree' : [ 1 ],
#     'coef0'  : [ 0 ]
# }

# kridge = GridSearchCV( KernelRidge(), cv = 5, param_grid = kernel_grid, n_jobs = -1, scoring = 'neg_mean_squared_error', verbose = 1 )
# kridge.fit( train_X, train_y )

###### Elastic Net

In [48]:
# elastic_grid = {
#     'alpha'        : [ 0.001 ],
#     'l1_ratio'     : [ 0.5 ],
#     'normalize'    : [ True ],
#     'max_iter'     : [ 2500 ],
#     'random_state' : [ 2 ]
# }

# elastic = GridSearchCV( ElasticNet(), cv = 5, param_grid = elastic_grid, n_jobs = -1, scoring = 'neg_mean_squared_error', verbose = 1 )
# elastic.fit( train_X, train_y )

###### LightGBM

In [49]:
# lgbm_grid = {
#     'objective'        : [ 'regression' ],
#     'n_estimators'     : [ 500 ],
#     'learning_rate'    : [ 0.05 ],
#     'num_leaves'       : [ 5 ],
#     'max_bin'          : [ 60 ],
#     'bagging_fraction' : [ 0.5 ],
#     'feature_fraction' : [ 0.25 ],
#     'colsample_bytree' : [ 0.5 ],
#     'reg_alpha'        : [ 0 ],
#     'reg_lambda'       : [ 0.25 ],
#     'subsample'        : [ 0.5 ]
# }

# lgbm = GridSearchCV( lgb.LGBMRegressor(), cv = 5, param_grid = lgbm_grid, n_jobs = -1, scoring = 'neg_mean_squared_error', verbose = 1 )
# lgbm.fit( train_X, train_y )

#### Run Selected Model on Data

In [50]:
# XGBRegressor
# ==========
xgbm_model = XGBRegressor(
colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1
)

# GradientBoostingRegressor
# ==================================================
gbr_model = GradientBoostingRegressor( 
   n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5
)

# Lasso
# ==================================================
lasso_model = Lasso(
alpha =0.0005, random_state=1
) 

# KernelRidge
# ==================================================
kridge_model = KernelRidge(
  alpha=0.6, kernel='polynomial', degree=2, coef0=2.5
) 

# ElasticNet
# ==================================================
elastic_model = ElasticNet(
alpha=0.0005, l1_ratio=.9, random_state=3
) 

# LightGBM
# ==================================================
lgbm_model = lgb.LGBMRegressor(
    objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11
)

In [51]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [58]:
# Stacked CV Regressor
stacked_model = StackingAveragedModels( base_models = ( elastic_model, gbr_model, kridge_model ),
                                        meta_model  = lasso_model )

stacked_model.fit( train_X.values, train_y.values )
stacked_predict = np.expm1( stacked_model.predict( test_X.values ) )

# LightGBM
lgbm_model.fit( train_X, train_y )
lgbm_predict = np.expm1( lgbm_model.predict( test_X ) )

#XGBRegressor
xgbm_model.fit( train_X, train_y )
xgbm_predict = np.expm1( xgbm_model.predict( test_X ) )

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [None]:
# Ensemble
ensemble = stacked_predict * 0.70 +\
           lgbm_predict * 0.15 +\
           xgbm_predict * 0.15

#### Create Submission

In [None]:
submission = pd.DataFrame(
    { 'Id'        : test_raw[ 'Id' ],
      'SalePrice' : ensemble } 
)

submission.to_csv( '.\\ames_housing_submit_3.csv', index = False )
submission.head( 5 )

In [None]:
# 1	Id	SalePrice
# 2	1461	119993.52799010304
# 3	1462	160505.5055045645
# 4	1463	186796.66681037686
# 5	1464	196605.93965842467