## Imports

In [2]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
import warnings 
warnings.simplefilter("ignore")

In [3]:
# Plot
from IPython import display

# Select the notebook backend before anything imports pyplot: seaborn and
# matplotlib.pylab both import it, and on older matplotlib releases a
# `matplotlib.use` call made after that point is ignored.
import matplotlib
matplotlib.use('nbagg')

import seaborn as sns
import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the jupyterthemes plot style once every plotting library is loaded
jtplot.style('gruvboxd')

## Data Reading

In [4]:
# Data Reading: both tables are indexed by their `Id` column
df_test = pd.read_csv('./data/test.csv', index_col='Id')
df_train = pd.read_csv('./data/train.csv', index_col='Id')

# The feature set is exactly the column set of the test table;
# the target column is taken from the train table.
target = 'SalePrice'
features = df_test.columns

# Split X_train, y_train: pull the target out first, then keep only
# the feature columns (in test-table column order).
y_train = df_train.loc[:, target]
df_train = df_train.loc[:, features]

df_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


## Preprocessing

In [6]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing.base import *
from robusta.pipeline import *

# Numeric branch: select numeric columns, impute with the "median"
# strategy, then apply GaussRank.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    Imputer(strategy="median"),
    GaussRank(),
)

# Categorical branch: select object-dtype columns and label-encode them
# (dtype='category' is passed through to the encoder).
category_branch = make_pipeline(
    TypeSelector("object"),
    LabelEncoder(dtype='category'),
)

# Full preprocessing: restrict to the feature columns, then run both
# branches side by side and join their outputs.
data_prep = make_pipeline(
    ColumnSelector(columns=features),
    FeatureUnion([
        ("numeric", numeric_branch),
        ("category", category_branch),
    ]),
)

# Fit on the train table only; the test table reuses the fitted pipeline
X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.233308,-0.25767,-0.273117,0.443482,-0.28864,0.5848,0.423591,0.572076,0.471615,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
2,-0.654032,0.551737,0.045557,-0.005474,1.248297,0.054685,-0.268858,-0.374993,0.784685,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
3,0.233308,-0.183035,0.412563,0.443482,-0.28864,0.494613,0.353446,0.471615,0.121388,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
4,0.487486,-0.460667,0.018212,0.443482,-0.28864,-1.205004,-0.396922,-0.374993,-0.181024,-0.101106,...,5,2,4,4,2,-1,-1,-1,8,0
5,0.233308,0.660525,0.895264,0.913711,-0.28864,0.461053,0.256459,0.964415,0.371504,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4


## Fold Preparation

In [11]:
# Categorical columns ('category' or 'object' dtype) go through a
# TargetEncoderCV configured with cv=4 and encoder smoothing of 200.
target_encoding = make_pipeline(
    TypeSelector(['category', 'object']),
    TargetEncoderCV(cv=4).set_params(encoder__smoothing=200.0),
)

# Numeric columns are routed through Identity()
numeric_passthrough = make_pipeline(
    TypeSelector(np.number),
    Identity(),
)

encoder = FeatureUnion([
    ('category', target_encoding),
    ('numeric', numeric_passthrough),
])

# Encoded view of the training features (the fit needs the target)
F_train = encoder.fit_transform(X_train, y_train)

F_train.sample(5, random_state=555)

Unnamed: 0_level_0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1349,183938.52296,186620.358182,184419.524921,226668.983751,225610.114455,185370.752283,201523.122866,186780.85653,185270.115755,181653.019401,...,0.172882,1.365757,0.003642,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,0.524039,-0.361037
1110,183938.52296,186620.358182,184419.524921,190194.789892,225610.114455,185370.752283,269322.761367,186780.85653,185270.115755,181653.019401,...,1.202419,0.270502,1.197297,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,0.596492
1354,183938.52296,186620.358182,184419.524921,198048.438473,225610.114455,185370.752283,201523.122866,186780.85653,185270.115755,181653.019401,...,0.71039,0.980035,0.612598,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,4.320005
224,182183.732645,184483.389451,182213.665384,164419.809957,166956.78993,170016.54147,143096.095745,184729.582975,182669.768793,179667.775347,...,0.461804,0.726666,-0.534159,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,0.596492
907,184785.288934,186418.338854,184687.9539,165919.275727,230473.994424,185583.502581,201010.211373,187120.287629,185868.524236,182402.165108,...,1.220863,-0.452825,0.396122,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.053148,-0.361037


## Model

In [12]:
from lightgbm import LGBMRegressor

# Baseline regressor: LightGBM with every hyperparameter left at its library default
model = LGBMRegressor()

In [14]:
%%time
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import *

def rmsle_score(y_true, y_pred):
    """Return the negated root mean squared logarithmic error.

    The sign is flipped so that a larger (closer to zero) value means
    a better fit.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return -np.sqrt(msle)

def rmsle_scorer(estimator, X_test, y_test):
    """Score a fitted estimator: negated RMSLE of its predictions on X_test."""
    y_pred = estimator.predict(X_test)
    return rmsle_score(y_test, y_pred)

cv = 5
metric = rmsle_scorer


def transform_target(reg):
    """Wrap `reg` in a TransformedTargetRegressor using np.log1p / np.expm1.

    NOTE(review): the two functions are passed positionally; this presumes
    the TransformedTargetRegressor in scope takes (regressor, func,
    inverse_func) in that order -- confirm against its definition.
    """
    return TransformedTargetRegressor(reg, np.log1p, np.expm1)


# Alias kept so both spellings of the helper name resolve.
trans_target = transform_target

# (label, estimator) pairs, all scored with the same `cv` and `metric`
candidates = [
    ('model', make_pipeline(model)),
    ('encoder + model', make_pipeline(encoder, model)),
    ('trans + model', transform_target(make_pipeline(model))),
    ('trans + encoder + model', transform_target(make_pipeline(encoder, model))),
]

for label, estimator in candidates:
    scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=metric)
    print('{:.4f} ± {:.4f} [{}]'.format(np.mean(scores), np.std(scores), label))

-0.1315 ± 0.0111 [model]
-0.1320 ± 0.0120 [encoder + model]
-0.1280 ± 0.0080 [transformer + model]
-0.1301 ± 0.0105 [transformer + encoder + model]
CPU times: user 1min 6s, sys: 5.53 s, total: 1min 11s
Wall time: 27.3 s


In [20]:
# Best configuration from the comparison above: the plain model wrapped by
# the log1p/expm1 target-transform helper defined in the scoring cell.
best_estimator = trans_target(make_pipeline(model))

## Submit (Out-of-Fold & Test Predictions)

In [21]:
%%time
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for the training rows, labelled with the
# training index and the target name.
y_oof = pd.Series(
    cross_val_predict(best_estimator, X_train, y_train, cv=cv),
    index=X_train.index,
    name=target,
)
print(y_oof.head())

Id
1    206965.091086
2    171002.161768
3    212944.506762
4    192282.336437
5    309558.297202
Name: SalePrice, dtype: float64
CPU times: user 11.5 s, sys: 545 ms, total: 12 s
Wall time: 1.64 s


In [22]:
%%time
from sklearn.model_selection import cross_validate

# Fit one estimator per fold and keep the fitted estimators
scores = cross_validate(best_estimator, X_train, y_train, cv=cv, return_estimator=True)

# Test-set predictions from each fold's estimator, averaged element-wise
y_subs = [fold_estimator.predict(X_test) for fold_estimator in scores['estimator']]
y_sub = pd.Series(
    np.mean(y_subs, axis=0),
    index=X_test.index,
    name=target,
)
print(y_sub.head())

Id
1461    127469.388269
1462    171951.489751
1463    191727.807793
1464    193514.060950
1465    184373.451800
Name: SalePrice, dtype: float64
CPU times: user 17.6 s, sys: 788 ms, total: 18.3 s
Wall time: 2.45 s


In [23]:
import os

# Output directory for prediction files, under the current working directory
path = os.path.join(os.getcwd(), 'pred')
# Create it if needed; exist_ok avoids the check-then-create race and is a
# no-op when the directory is already there.
os.makedirs(path, exist_ok=True)

sub_path = os.path.join(path, '0 sub baseline.csv')
oof_path = os.path.join(path, '0 oof baseline.csv')

# header=True writes the series name as the column header
y_sub.to_csv(sub_path, header=True)
y_oof.to_csv(oof_path, header=True)

### Score (RMSLE, lower is better):
### `[CV] 0.1280 ± 0.0080`
### `[LB] 0.1373`