# Kaggle Housing dataset prediction

The following notebook is a ensembling/stacking model for predicting housing sale price for kaggle competition.
On the kaggle public scoreboard, the submission scored 0.12207 being around top 20% on the leader board.

## Stacking models

For the stacking process, we will use a XGBoost as second stacking model. We will take prediction results from base models in **house** notebook.

In [2]:
import pandas as pd
import shutil
import glob, os
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

#get predictions from bsae models
saved_model_files = glob.glob('/content/gdrive/My Drive/Colab Notebooks/house/saved_model/*.csv')
stacking_model_files = glob.glob('/content/gdrive/My Drive/Colab Notebooks/house/stacking_model/*.csv')
saved_model_files.remove('/content/gdrive/My Drive/Colab Notebooks/house/saved_model/labels.csv')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#read base model predictions and save dataframe in a list
saved_models = []
stacking_models = []
for file in stacking_model_files:
    stacking_models.append(pd.read_csv(file))
for file in saved_model_files:
    saved_models.append(pd.read_csv(file))

In [0]:
#concatentate train_set and sort by Id
train_set = stacking_models.pop(0)
for model in stacking_models:
    train_set = pd.concat((train_set,model))
train_set = train_set.sort_values(by=['Id'])

#merge test_set together
test_set = saved_models.pop(0)
for model in saved_models:
    test_set = pd.merge(test_set,model,on='Id')

In [0]:
#average predictions from the test predictions. Note that we can already submit the averaged value as final result
test_set.columns = ['Id'] + list(range(1,5))
test_set['mean'] = test_set.iloc[:,1:5].mean(axis=1)
test_set=test_set.drop(list(range(1,5)),axis=1)
test_set.columns = ['Id','SalePrice']

#read in labels
labels = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/house/saved_model/labels.csv')
features = train_set.drop(['Id'],axis=1)

In [7]:
#parameters search using RandomizedCV search
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
params = {
        'n_estimators': [400,500,600,700,900,1200,1500,1800,2000,2300,2500,3000,3200,3500,4000],
        'learning_rate': [0.1,0.01,0.02,0.001,0.005,0.0001,0.0005,0.00001],
        'max_depth': [8,9,10,15,20,30,40,50,60,70],
        'subsample':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
        'colsample_bytree': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
        'min_child_weight': [0.00001,0.0001,0.001,0.01,0.1,1,3,5,7],
        'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
}
xgb_model = XGBRegressor()
skf = KFold(n_splits=4)
skf.get_n_splits()

random_search = RandomizedSearchCV(xgb_model,
                                   param_distributions=params,
                                   n_iter=100, #for actual competition 2000 iterations were used
                                   scoring='neg_mean_squared_error',
                                   cv=skf.split(features,labels),
                                   verbose=3, 
                                   n_jobs=4)
random_search.fit(features,labels)
print(random_search.cv_results_)
print(random_search.best_estimator_.score(features,labels))
print(random_search.best_score_)
print(random_search.best_estimator_)

Fitting 4 folds for each of 100 candidates, totalling 400 fits
[CV] subsample=0.3, n_estimators=600, min_child_weight=5, max_depth=20, learning_rate=0.02, gamma=0.6, colsample_bytree=0.3 
[CV] subsample=0.3, n_estimators=600, min_child_weight=5, max_depth=20, learning_rate=0.02, gamma=0.6, colsample_bytree=0.3 
[CV] subsample=0.3, n_estimators=600, min_child_weight=5, max_depth=20, learning_rate=0.02, gamma=0.6, colsample_bytree=0.3 
[CV] subsample=0.3, n_estimators=600, min_child_weight=5, max_depth=20, learning_rate=0.02, gamma=0.6, colsample_bytree=0.3 
[CV]  subsample=0.3, n_estimators=600, min_child_weight=5, max_depth=20, learning_rate=0.02, gamma=0.6, colsample_bytree=0.3, score=-0.012182625822595902, total=   0.7s
[CV] subsample=0.4, n_estimators=500, min_child_weight=3, max_depth=8, learning_rate=1e-05, gamma=1.0, colsample_bytree=0.4 
[CV]  subsample=0.3, n_estimators=600, min_child_weight=5, max_depth=20, learning_rate=0.02, gamma=0.6, colsample_bytree=0.3, score=-0.01732572

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   10.1s


[CV] subsample=0.2, n_estimators=3200, min_child_weight=3, max_depth=50, learning_rate=0.001, gamma=0.8, colsample_bytree=0.6 
[CV]  subsample=0.2, n_estimators=600, min_child_weight=1, max_depth=50, learning_rate=0.005, gamma=0.4, colsample_bytree=0.4, score=-0.37882034931405073, total=   0.3s
[CV] subsample=0.2, n_estimators=3200, min_child_weight=3, max_depth=50, learning_rate=0.001, gamma=0.8, colsample_bytree=0.6 
[CV]  subsample=0.2, n_estimators=600, min_child_weight=1, max_depth=50, learning_rate=0.005, gamma=0.4, colsample_bytree=0.4, score=-0.3595303075008194, total=   0.3s
[CV] subsample=0.2, n_estimators=3200, min_child_weight=3, max_depth=50, learning_rate=0.001, gamma=0.8, colsample_bytree=0.6 
[CV]  subsample=0.2, n_estimators=3200, min_child_weight=3, max_depth=50, learning_rate=0.001, gamma=0.8, colsample_bytree=0.6, score=-0.26026855356034095, total=   1.4s
[CV] subsample=0.5, n_estimators=400, min_child_weight=5, max_depth=8, learning_rate=0.02, gamma=0.6, colsample_

[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   34.5s


[CV]  subsample=0.7, n_estimators=2300, min_child_weight=1e-05, max_depth=50, learning_rate=0.0005, gamma=0.9, colsample_bytree=0.2, score=-13.444411055701277, total=   0.9s
[CV] subsample=0.3, n_estimators=1800, min_child_weight=1, max_depth=15, learning_rate=0.02, gamma=0.9, colsample_bytree=0.5 
[CV]  subsample=0.7, n_estimators=2300, min_child_weight=1e-05, max_depth=50, learning_rate=0.0005, gamma=0.9, colsample_bytree=0.2, score=-13.369995893335119, total=   1.0s
[CV] subsample=0.3, n_estimators=1800, min_child_weight=1, max_depth=15, learning_rate=0.02, gamma=0.9, colsample_bytree=0.5 
[CV]  subsample=0.7, n_estimators=2300, min_child_weight=1e-05, max_depth=50, learning_rate=0.0005, gamma=0.9, colsample_bytree=0.2, score=-13.422591702115197, total=   1.0s
[CV] subsample=0.3, n_estimators=1800, min_child_weight=1, max_depth=15, learning_rate=0.02, gamma=0.9, colsample_bytree=0.5 
[CV]  subsample=0.7, n_estimators=2300, min_child_weight=1e-05, max_depth=50, learning_rate=0.0005, 

[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  2.1min


[CV]  subsample=1.0, n_estimators=900, min_child_weight=0.01, max_depth=20, learning_rate=1e-05, gamma=0.8, colsample_bytree=0.1, score=-130.15966855914334, total=   0.2s
[CV]  subsample=1.0, n_estimators=900, min_child_weight=0.01, max_depth=20, learning_rate=1e-05, gamma=0.8, colsample_bytree=0.1, score=-129.752699039863, total=   0.2s
[CV] subsample=0.6, n_estimators=3200, min_child_weight=0.01, max_depth=40, learning_rate=0.005, gamma=0.2, colsample_bytree=0.1 
[CV] subsample=0.6, n_estimators=3200, min_child_weight=0.01, max_depth=40, learning_rate=0.005, gamma=0.2, colsample_bytree=0.1 
[CV]  subsample=0.2, n_estimators=3000, min_child_weight=1e-05, max_depth=60, learning_rate=0.001, gamma=0.3, colsample_bytree=0.9, score=-0.3789671434049377, total=   1.2s
[CV] subsample=0.6, n_estimators=3200, min_child_weight=0.01, max_depth=40, learning_rate=0.005, gamma=0.2, colsample_bytree=0.1 
[CV]  subsample=0.2, n_estimators=3000, min_child_weight=1e-05, max_depth=60, learning_rate=0.001

[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:  3.4min finished


{'mean_fit_time': array([ 0.70704836,  0.13211155,  1.33824301,  4.23200905,  2.37812161,
        0.25954723,  0.26693404,  1.32417816,  0.38859153,  0.46243322,
        0.7253595 ,  0.99018824,  0.99319404,  0.90267414,  2.16184616,
        0.13023728,  0.48762727,  0.24977171,  0.20364207,  0.68284178,
        2.51600999,  2.3694548 ,  1.6055184 ,  0.10273194,  0.20021045,
        0.28056979,  0.82542533,  3.45188898,  0.30327839,  0.9482618 ,
        0.98082495,  2.97588122,  5.96701539,  0.61266774,  1.21473229,
        0.13877124,  0.24849683,  1.90599972,  1.68829709,  0.5671221 ,
        0.80885196,  1.39834195,  4.22376633,  0.50471395,  0.36124635,
        1.61075985,  0.29503042,  0.83133572,  0.30106896,  1.47002846,
        2.93531686,  0.59773672,  6.72512764,  0.29845524,  0.28014779,
        0.95962662,  0.74139839,  0.10520148,  0.90625536, 22.73215306,
        0.37139028,  0.80593407,  0.34682775,  0.67000163,  0.45382184,
        0.23147124,  1.52749628, 18.87439466, 

In [0]:
#pop Id and send results

index = test_set.pop('Id')
#don't forget to reserse log on the labels
submit_predictions = np.expm1(random_search.best_estimator_.predict(test_set))
submit = pd.DataFrame(data={'Id':index,'SalePrice':submit_predictions})
submit.to_csv('/content/gdrive/My Drive/Colab Notebooks/house/submit.csv',index=False)