## Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
# NOTE: this installs a blanket "ignore" filter, so warnings of every category
# (deprecation notices included) are hidden from here on, unless a later
# filter overrides it.
import warnings 
warnings.simplefilter("ignore")

In [2]:
# Plot
# Pick the backend BEFORE anything imports pyplot: seaborn and matplotlib.pylab
# both pull pyplot in, and matplotlib documents that use() should be called
# ahead of the first pyplot import (older releases ignore a late call, and the
# resulting warning would be hidden by the global "ignore" filter).
import matplotlib
matplotlib.use('nbagg')

from IPython import display
import seaborn as sns
import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the notebook plotting theme
jtplot.style('gruvboxd')

## Data Reading

In [3]:
# Load the train and test tables, both indexed by their `Id` column
df_train, df_test = (
    pd.read_csv('./data/train.csv', index_col='Id'),
    pd.read_csv('./data/test.csv', index_col='Id'),
)

# The columns of the test table are used as the feature list
target, features = 'SalePrice', df_test.columns

# Pull the target out first, then restrict the training frame to the features
y_train = df_train.loc[:, target]
df_train = df_train.loc[:, features]

df_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## Preprocessing

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.basic import *
from robusta.pipeline import *

# Branch for numeric columns: select by dtype, impute with the median, then
# apply GaussRank (presumably a rank-based Gaussian transform -- confirm in robusta).
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    Imputer(strategy="median"),
    GaussRank(),
)

# Branch for object-typed columns: label-encode into the 'category' dtype.
category_branch = make_pipeline(
    TypeSelector("object"),
    LabelEncoder(dtype='category'),
)

# Full preprocessing: keep only the feature columns, then run both branches
# side by side and join their outputs.
data_prep = make_pipeline(
    ColumnSelector(columns=features),
    FeatureUnion([
        ("numeric", numeric_branch),
        ("category", category_branch),
    ]),
)

# Fit on the training frame only; the test frame is transformed with the fitted steps
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.233308,-0.25767,-0.273117,0.443482,-0.28864,0.5848,0.423591,0.572076,0.471615,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
2,-0.654032,0.551737,0.045557,-0.005474,1.248297,0.054685,-0.268858,-0.374993,0.784685,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
3,0.233308,-0.183035,0.412563,0.443482,-0.28864,0.494613,0.353446,0.471615,0.121388,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
4,0.487486,-0.460667,0.018212,0.443482,-0.28864,-1.205004,-0.396922,-0.374993,-0.181024,-0.101106,...,5,2,4,4,2,-1,-1,-1,8,0
5,0.233308,0.660525,0.895264,0.913711,-0.28864,0.461053,0.256459,0.964415,0.371504,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4


## Fold Preparation

In [5]:
# Target encoder built with cv=4 and an initial smoothing of 2.0
# (the smoothing value is what the grid search further down varies).
target_encoder = TargetEncoderCV(cv=4).set_params(encoder__smoothing=2.0)

# Fold-level preparation: categorical columns go through the target encoder,
# numeric columns are passed through Identity().
fold_prep = FeatureUnion([
    ('category', make_pipeline(TypeSelector(['category', 'object']), target_encoder)),
    ('numeric', make_pipeline(TypeSelector(np.number), Identity())),
])

# Fit/transform on the full training set just to inspect the resulting features
F_train = fold_prep.fit_transform(X_train, y_train)

F_train.sample(5, random_state=555)

Unnamed: 0_level_0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1349,183953.688109,186675.925275,184442.902464,253859.932039,234493.847826,185408.265525,203424.272727,186813.734574,185305.07211,181654.209797,...,0.172882,1.365757,0.003642,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,0.524039,-0.361037
1110,183953.688109,186675.925275,184442.902464,195923.597701,234493.847826,185408.265525,324834.451613,186813.734574,185305.07211,181654.209797,...,1.202419,0.270502,1.197297,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,0.596492
1354,183953.688109,186675.925275,184442.902464,205459.907975,234493.847826,185408.265525,203424.272727,186813.734574,185305.07211,181654.209797,...,0.71039,0.980035,0.612598,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,4.320005
224,182198.659512,184533.832237,182232.582653,163965.676136,161265.740741,161980.263321,139922.8,184759.723902,182696.907545,179667.694931,...,0.461804,0.726666,-0.534159,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,0.596492
907,184802.372683,186463.040087,184707.416331,165499.046639,240523.677215,185615.780359,203001.598234,187151.016553,185902.971549,182404.291782,...,1.220863,-0.452825,0.396122,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.053148,-0.361037


## Estimator

In [6]:
from lightgbm import LGBMRegressor

# LightGBM regressor created with no arguments, i.e. library-default
# hyperparameters; it becomes the final step of the pipeline built in the
# evaluation cell below.
model = LGBMRegressor()

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [7]:
%%time
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import *

def rmsle_score(y_true, y_pred):
    """Return the negated root of `mean_squared_log_error(y_true, y_pred)`.

    The sign is flipped so that larger values mean better predictions
    (0 is a perfect fit), which is what the model-selection calls in this
    notebook receive through `scoring=`.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return -np.sqrt(msle)

def rmsle_scorer(estimator, X_test, y_test):
    """Scorer callable with the `(estimator, X, y)` signature.

    Predicts on `X_test` with the given estimator and returns
    `rmsle_score(y_test, predictions)`.
    """
    y_pred = estimator.predict(X_test)
    return rmsle_score(y_test, y_pred)

# Evaluation protocol shared by every model-selection call below
cv, metric = 5, rmsle_scorer

# The regressor (fold preparation + model) is fitted on log1p(target) and its
# predictions are mapped back with expm1.
# NOTE(review): TransformedTargetRegressor is not imported explicitly in this
# notebook -- presumably it comes from one of the robusta star imports; confirm.
estimator = TransformedTargetRegressor(
    regressor=make_pipeline(fold_prep, model),
    func=np.log1p,
    inverse_func=np.expm1,
)

# One score per fold
scores = cross_val_score(estimator, X_train, y_train, scoring=metric, cv=cv)

print('{:.4f} ± {:.4f}'.format(scores.mean(), scores.std()))

-0.1308 ± 0.0102
CPU times: user 22.8 s, sys: 3.49 s, total: 26.3 s
Wall time: 17.8 s


## (Fold) Optimizer

In [8]:
%%time
from sklearn.model_selection import GridSearchCV

# Grid over a single hyperparameter: the smoothing of the target encoder
# nested inside the regressor pipeline.
param_space = [1, 10, 50, 100, 200, 500]
param = 'regressor__featureunion__category__encodercv__encoder__smoothing'
space = dict([(param, param_space)])

# Same folds and scorer as the baseline evaluation above
gs = GridSearchCV(estimator=estimator, param_grid=space, scoring=metric, cv=cv)
gs.fit(X_train, y_train)

CPU times: user 2min 59s, sys: 27.9 s, total: 3min 27s
Wall time: 2min 6s


In [9]:
# One row per candidate value of the searched parameter
pd.DataFrame.from_dict(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__featureunion__category__encodercv__encoder__smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.820055,0.172107,0.841537,0.016923,1,{'regressor__featureunion__category__encodercv...,-0.115706,-0.142783,-0.134651,-0.120078,...,-0.13017,0.010449,5,-0.038712,-0.036666,-0.038894,-0.037812,-0.039024,-0.038222,0.000886
1,2.633279,0.198168,0.807794,0.046326,10,{'regressor__featureunion__category__encodercv...,-0.116783,-0.143423,-0.133089,-0.120044,...,-0.129149,0.009651,1,-0.040168,-0.035828,-0.037612,-0.038226,-0.039599,-0.038287,0.001534
2,2.834157,0.309549,0.884599,0.08614,50,{'regressor__featureunion__category__encodercv...,-0.117285,-0.145293,-0.13275,-0.11862,...,-0.129386,0.010389,2,-0.038823,-0.036327,-0.03901,-0.038076,-0.038912,-0.03823,0.001007
3,2.776763,0.167068,0.796444,0.065284,100,{'regressor__featureunion__category__encodercv...,-0.117341,-0.145663,-0.132451,-0.118894,...,-0.130253,0.010793,6,-0.039569,-0.035597,-0.03853,-0.037972,-0.039112,-0.038156,0.001388
4,2.4129,0.510272,0.781818,0.148568,200,{'regressor__featureunion__category__encodercv...,-0.11907,-0.146476,-0.132454,-0.118213,...,-0.130073,0.010518,4,-0.039439,-0.035405,-0.038556,-0.03834,-0.03842,-0.038032,0.001371
5,1.722727,0.007755,0.604785,0.006214,500,{'regressor__featureunion__category__encodercv...,-0.117442,-0.143447,-0.130749,-0.120457,...,-0.12973,0.009724,3,-0.03875,-0.035923,-0.038892,-0.038042,-0.039007,-0.038123,0.00115


In [10]:
best_estimator = gs.best_estimator_


def cv_report(candidate):
    """Cross-validate `candidate` on the training data, print 'mean ± std'
    of the fold scores, and return the fold scores."""
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring=metric)
    print('{:.4f} ± {:.4f}'.format(np.mean(fold_scores), np.std(fold_scores)))
    return fold_scores


# Baseline first, tuned estimator second; `scores` ends up holding the tuned run.
# NOTE(review): the tuned estimator is scored with the same `cv` setting that
# selected it, so its figure may be optimistic.
scores = cv_report(estimator)
scores = cv_report(best_estimator)

-0.1308 ± 0.0102
-0.1291 ± 0.0097


## Submit (Out-of-Fold & Test Predictions)

In [11]:
%%time
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for the training rows, labelled with the training index
y_oof = pd.Series(
    cross_val_predict(best_estimator, X_train, y_train, cv=cv),
    index=X_train.index,
    name=target,
)
print(y_oof.head())

Id
1    207574.660506
2    171839.035598
3    208177.630559
4    182846.122412
5    305076.218709
Name: SalePrice, dtype: float64
CPU times: user 21.8 s, sys: 1.98 s, total: 23.8 s
Wall time: 11.9 s


In [12]:
%%time
from sklearn.model_selection import cross_validate

# Refit the tuned estimator on every fold and keep the fitted copies
scores = cross_validate(best_estimator, X_train, y_train, cv=cv, return_estimator=True)

# Each fold's model predicts the test set; the submission is their element-wise mean
y_subs = [fold_model.predict(X_test) for fold_model in scores['estimator']]
y_sub = pd.Series(np.mean(y_subs, axis=0), index=X_test.index, name=target)
print(y_sub.head())

Id
1461    127801.291361
1462    179115.320770
1463    191076.047284
1464    197249.579888
1465    184163.294438
Name: SalePrice, dtype: float64
CPU times: user 37.1 s, sys: 4.08 s, total: 41.1 s
Wall time: 18.8 s


In [13]:
import os

# Output directory: ./pred under the current working directory.
# makedirs(exist_ok=True) creates it when missing and is a no-op when it is
# already there, replacing the separate exists() check before mkdir().
path = os.path.join(os.getcwd(), 'pred')
os.makedirs(path, exist_ok=True)

sub_path = os.path.join(path, '0 sub baseline.csv')
oof_path = os.path.join(path, '0 oof baseline.csv')

# Write the test-set predictions and the out-of-fold predictions, each with a header row
y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)