# REF_house-prices-solution-0-107-lb

Reference:
- https://www.kaggle.com/jesucristo/house-prices-solution-0-107-lb

## Import PKGs

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Folders

In [2]:
CWD = os.getcwd()
input_folder = os.path.join(CWD, 'input')
output_folder = os.path.join(CWD, 'output')
model_folder = os.path.join(CWD, 'model')

In [45]:
train_csv = os.path.join(input_folder, 'train.csv')
test_csv = os.path.join(input_folder, 'test.csv')
sample_submission_csv = os.path.join(input_folder, 'sample_submission.csv')
submission_csv = os.path.join(output_folder, 'submission.csv')

print(train_csv)
print(test_csv)
print(sample_submission_csv)
print(submission_csv)

D:\Kaggle\house-prices-advanced-regression-techniques\input\train.csv
D:\Kaggle\house-prices-advanced-regression-techniques\input\test.csv
D:\Kaggle\house-prices-advanced-regression-techniques\input\sample_submission.csv
D:\Kaggle\house-prices-advanced-regression-techniques\output\submission.csv


## Load Data

In [8]:
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

display(train_df.shape, train_df.head(2))
display(test_df.shape, test_df.head(2))
# display(train_df.columns)
# display(test_df.columns)

(1460, 81)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


(1459, 80)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal


In [9]:
train_df.drop(['Id'], axis=1, inplace=True)
test_df.drop(['Id'], axis=1, inplace=True)

In [12]:
train_df = train_df[train_df.GrLivArea < 4500]
train_df.reset_index(drop=True, inplace=True)
train_df['SalePrice'] = np.log1p(train_df['SalePrice'])
y_train = train_df['SalePrice'].reset_index(drop=True)

## Features

In [15]:
x_train_df = train_df.drop(['SalePrice'], axis=1)
x_test_df = test_df
x_data_df = pd.concat([x_train_df, x_test_df]).reset_index(drop=True)
display(x_data_df.shape)

(2917, 79)

In [17]:
x_data_df['MSSubClass'] = x_data_df['MSSubClass'].apply(str)
x_data_df['YrSold'] = x_data_df['YrSold'].astype(str)
x_data_df['MoSold'] = x_data_df['MoSold'].astype(str)
x_data_df['Functional'] = x_data_df['Functional'].fillna('Typ') 
x_data_df['Electrical'] = x_data_df['Electrical'].fillna("SBrkr") 
x_data_df['KitchenQual'] = x_data_df['KitchenQual'].fillna("TA") 
x_data_df["PoolQC"] = x_data_df["PoolQC"].fillna("None")
x_data_df['Exterior1st'] = x_data_df['Exterior1st'].fillna(x_data_df['Exterior1st'].mode()[0]) 
x_data_df['Exterior2nd'] = x_data_df['Exterior2nd'].fillna(x_data_df['Exterior2nd'].mode()[0])
x_data_df['SaleType'] = x_data_df['SaleType'].fillna(x_data_df['SaleType'].mode()[0])

In [18]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    x_data_df[col] = x_data_df[col].fillna(0)
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    x_data_df[col] = x_data_df[col].fillna('None')
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    x_data_df[col] = x_data_df[col].fillna('None')

In [19]:
x_data_df['MSZoning'] = x_data_df.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

In [20]:
objects = []
for i in x_data_df.columns:
    if x_data_df[i].dtype == object:
        objects.append(i)
x_data_df.update(x_data_df[objects].fillna('None'))

x_data_df['LotFrontage'] = x_data_df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in x_data_df.columns:
    if x_data_df[i].dtype in numeric_dtypes:
        numerics.append(i)
x_data_df.update(x_data_df[numerics].fillna(0))

In [21]:
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in x_data_df.columns:
    if x_data_df[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_x_data_df = x_data_df[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_x_data_df[skew_x_data_df > 0.5]
skew_index = high_skew.index

for i in skew_index:
    x_data_df[i] = boxcox1p(x_data_df[i], boxcox_normmax(x_data_df[i] + 1))

In [22]:
x_data_df = x_data_df.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

x_data_df['YrBltAndRemod']=x_data_df['YearBuilt']+x_data_df['YearRemodAdd']
x_data_df['TotalSF']=x_data_df['TotalBsmtSF'] + x_data_df['1stFlrSF'] + x_data_df['2ndFlrSF']

x_data_df['Total_sqr_footage'] = (x_data_df['BsmtFinSF1'] + x_data_df['BsmtFinSF2'] +
                                 x_data_df['1stFlrSF'] + x_data_df['2ndFlrSF'])

x_data_df['Total_Bathrooms'] = (x_data_df['FullBath'] + (0.5 * x_data_df['HalfBath']) +
                               x_data_df['BsmtFullBath'] + (0.5 * x_data_df['BsmtHalfBath']))

x_data_df['Total_porch_sf'] = (x_data_df['OpenPorchSF'] + x_data_df['3SsnPorch'] +
                              x_data_df['EnclosedPorch'] + x_data_df['ScreenPorch'] +
                              x_data_df['WoodDeckSF'])

In [23]:
x_data_df['haspool'] = x_data_df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
x_data_df['has2ndfloor'] = x_data_df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
x_data_df['hasgarage'] = x_data_df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
x_data_df['hasbsmt'] = x_data_df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
x_data_df['hasfireplace'] = x_data_df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [24]:
x_data_dum_df = pd.get_dummies(x_data_df).reset_index(drop=True)
display(x_data_dum_df.shape)

(2917, 333)

In [26]:
# x_train = x_data_dum_df.iloc[:len(y_train), :]
# x_test = x_data_dum_df.iloc[len(y_train):, :]
# display(x_train.shape, y_train.shape, x_test.shape)

In [30]:
y = y_train
X = x_data_dum_df.iloc[:len(y), :]
X_sub = x_data_dum_df.iloc[len(y):, :]
display(X.shape, y.shape, X_sub.shape)

(1458, 333)

(1458,)

(1459, 333)

In [31]:
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)

overfit = list(overfit)
X = X.drop(overfit, axis=1)
X_sub = X_sub.drop(overfit, axis=1)

display(X.shape, y_train.shape, X_sub.shape)

(1453, 332)

(1458,)

(1459, 332)

In [32]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)

In [33]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [34]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

In [35]:
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)

In [36]:
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )

In [37]:
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

In [38]:
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [39]:
%%time
score = cv_rmse(ridge)
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(elasticnet)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

LASSO: 0.0078 (0.0011)
 2019-03-14 21:56:42.011617
elastic net: 0.0079 (0.0011)
 2019-03-14 21:56:57.277737
SVR: 0.0091 (0.0010)
 2019-03-14 21:57:01.912739
lightgbm: 0.0083 (0.0013)
 2019-03-14 21:57:23.280735
gbr: 0.0084 (0.0012)
 2019-03-14 21:58:49.745783
xgboost: 0.0081 (0.0012)
 2019-03-14 22:01:34.176226


In [40]:
print('START Fit')

print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)

print('Lasso')
lasso_model_full_data = lasso.fit(X, y)

print('Ridge')
ridge_model_full_data = ridge.fit(X, y)

print('Svr')
svr_model_full_data = svr.fit(X, y)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)

print('xgboost')
xgb_model_full_data = xgboost.fit(X, y)

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)

START Fit
stack_gen
elasticnet
Lasso
Ridge
Svr
GradientBoosting
xgboost
lightgbm


## Blending Models

In [41]:
def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) + \
            (0.05 * lasso_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.1 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.1 * lgb_model_full_data.predict(X)) + \
            (0.3 * stack_gen_model.predict(np.array(X))))

In [42]:
print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

RMSLE score on train data:
0.004271124624362906


In [44]:
print('Predict submission')
submission = pd.read_csv(sample_submission_csv)
submission.iloc[:,1] = np.floor(np.expm1(blend_models_predict(X_sub)))

Predict submission


In [None]:
# print('Blend with Top Kernels submissions\n')
# sub_1 = pd.read_csv('../input/top-10-0-10943-stacking-mice-and-brutal-force/House_Prices_submit.csv')
# sub_2 = pd.read_csv('../input/hybrid-svm-benchmark-approach-0-11180-lb-top-2/hybrid_solution.csv')
# sub_3 = pd.read_csv('../input/lasso-model-for-regression-problem/lasso_sol22_Median.csv')
# submission.iloc[:,1] = np.floor((0.25 * np.floor(np.expm1(blend_models_predict(X_sub)))) + 
#                                 (0.25 * sub_1.iloc[:,1]) + 
#                                 (0.25 * sub_2.iloc[:,1]) + 
#                                 (0.25 * sub_3.iloc[:,1]))

## Submission

In [47]:
q1 = submission['SalePrice'].quantile(0.005)
q2 = submission['SalePrice'].quantile(0.995)
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x > q1 else x*0.77)
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x < q2 else x*1.1)
print(submission_csv)
submission.to_csv(submission_csv, index=False)

D:\Kaggle\house-prices-advanced-regression-techniques\output\submission.csv
