# Import

In [9]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings
import warnings 
warnings.simplefilter("ignore")

In [10]:
# Plot
from IPython import display
import seaborn as sns

import matplotlib
import matplotlib.pylab as plt
from jupyterthemes import jtplot

jtplot.style('gruvboxd')
matplotlib.use('nbagg')

# Data Reading

In [11]:
# Data Reading
df_train = pd.read_csv('./data/train.csv', index_col='Id')
df_test = pd.read_csv('./data/test.csv', index_col='Id') 

# Split X_train, y_train
target = 'SalePrice'
features = df_test.columns

y_train = df_train[target]
df_train = df_train[features]

df_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Preprocessing

In [12]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing import *
from robusta.compose import *

data_prep = make_pipeline(
    ColumnSelector(columns=features),
    FeatureUnion([
        ("numeric", make_pipeline(
            TypeSelector(np.number),
            Imputer(strategy="median"),
            GaussRank(),
        )),
        ("category", make_pipeline(
            TypeSelector("object"),
            LabelEncoder(dtype='category'),
        )),
    ])
)

X_train = data_prep.fit_transform(df_train)
X_test = data_prep.transform(df_test)

X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.233308,-0.25767,-0.273117,0.443482,-0.28864,0.5848,0.423591,0.572076,0.471615,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
2,-0.654032,0.551737,0.045557,-0.005474,1.248297,0.054685,-0.268858,-0.374993,0.784685,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
3,0.233308,-0.183035,0.412563,0.443482,-0.28864,0.494613,0.353446,0.471615,0.121388,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4
4,0.487486,-0.460667,0.018212,0.443482,-0.28864,-1.205004,-0.396922,-0.374993,-0.181024,-0.101106,...,5,2,4,4,2,-1,-1,-1,8,0
5,0.233308,0.660525,0.895264,0.913711,-0.28864,0.461053,0.256459,0.964415,0.371504,-0.101106,...,1,1,4,4,2,-1,-1,-1,8,4


# Fold Preparation

In [13]:
fold_prep = FeatureUnion([
    ('category', make_pipeline(
        TypeSelector(['category', 'object']), 
        TargetEncoderCV(cv=4, smoothing=1e3))),
    ('numeric', TypeSelector(np.number))
])
    
fold_prep.fit_transform(X_train, y_train).sample(5, random_state=555)

Unnamed: 0_level_0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1349,183953.688109,186675.925275,184442.902464,253859.932039,234493.847826,185408.265525,203424.272727,186813.734574,185305.07211,181654.209797,...,0.172882,1.365757,0.003642,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,0.524039,-0.361037
1110,183953.688109,186675.925275,184442.902464,195923.597701,234493.847826,185408.265525,324834.451613,186813.734574,185305.07211,181654.209797,...,1.202419,0.270502,1.197297,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,0.596492
1354,183953.688109,186675.925275,184442.902464,205459.907975,234493.847826,185408.265525,203424.272727,186813.734574,185305.07211,181654.209797,...,0.71039,0.980035,0.612598,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,4.320005
224,182198.659512,184533.832237,182232.582653,163965.676136,161265.740741,161980.263158,139922.8,184759.723902,182696.907545,179667.694931,...,0.461804,0.726666,-0.534159,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.922824,0.596492
907,184802.372683,186463.040087,184707.416331,165499.046639,240523.677215,185615.780359,203001.598234,187151.016553,185902.971549,182404.291782,...,1.220863,-0.452825,0.396122,-0.126319,-0.013962,-0.069919,-0.003642,-0.030967,-0.053148,-0.361037


In [14]:
from lightgbm import LGBMRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import *

def rmsle_score(y_true, y_pred): 
    return -np.sqrt(mean_squared_log_error(y_true, y_pred))

def rmsle_scorer(estimator, X_test, y_test): 
    return rmsle_score(y_test, estimator.predict(X_test))

cv = 5
scoring = rmsle_scorer



#gs = GridSearchCV(model_pipe_te, {param: param_space}, cv=cv, scoring=scoring)
#gs.fit(X_train, y_train)


model = TransformedTargetRegressor(
    func=np.log1p, 
    inverse_func=np.expm1,
    regressor = make_pipeline(
        FeatureUnion([
            ('category', make_pipeline(
                TypeSelector(['category', 'object']), 
                TargetEncoderCV(cv=4, smoothing=1e3)
            )),
            ('numeric', make_pipeline(
                TypeSelector(np.number)
            )),
        ]), 
        LGBMRegressor()
    ),
)

scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)

print('{:.4f} ± {:.4f}'.format(np.mean(scores), np.std(scores)))

-0.1302 ± 0.0104


In [22]:
param = 'encodercv__encoder__smoothing'
a, b = 0, 10
param_space = np.logspace(a, b, int(b-a+1)) + 1e-9

gs = GridSearchCV(model, {param: param_space}, cv=cv, scoring=scoring)
gs.fit(X_train, y_train)

ValueError: Invalid parameter encodercv for estimator TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>,
              regressor=Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(transformers=[('category', Pipeline(memory=None,
     steps=[('typeselector', TypeSelector(dtype=['category', 'object'])), ('encodercv', EncoderCV(cv=4,
     encoder=TargetEncoder(cols=None, drop_invariant=False, handle_missing='value',
       handle_unknown='val....0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])). Check the list of available parameters with `estimator.get_params().keys()`.

In [16]:
model.get_params()

{'func': <ufunc 'log1p'>,
 'inverse_func': <ufunc 'expm1'>,
 'regressor__memory': None,
 'regressor__steps': [('featureunion',
   FeatureUnion(transformers=[('category', Pipeline(memory=None,
        steps=[('typeselector', TypeSelector(dtype=['category', 'object'])), ('encodercv', EncoderCV(cv=4,
        encoder=TargetEncoder(cols=None, drop_invariant=False, handle_missing='value',
          handle_unknown='value', min_samples_leaf=1, return_df=True,
          smoothing=1.0, verbose=0),
        n_jobs=-1, verbose=0))])), ('numeric', Pipeline(memory=None,
        steps=[('typeselector', TypeSelector(dtype=<class 'numpy.number'>))]))])),
  ('lgbmregressor',
   LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
          importance_type='split', learning_rate=0.1, max_depth=-1,
          min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
          n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
          random_state=None, reg_alpha=0.