# Imports

In [1]:
# Basic imports
import pandas as pd
import numpy as np

# Warnings: install a blanket "ignore" filter so no warning is shown
# for the rest of the session.
import warnings

warnings.filterwarnings(action="ignore")

In [2]:
# Plot
# Pick the notebook backend FIRST. matplotlib.use() is only guaranteed to take
# effect when it runs before pyplot is imported, and matplotlib.pylab imports
# pyplot (seaborn / jtplot may do so as well), so the call must precede them.
import matplotlib
matplotlib.use('nbagg')

from IPython import display
import seaborn as sns

# NOTE(review): pylab is kept so that `plt` exposes the same namespace as
# before; matplotlib.pyplot is the recommended import if nothing relies on it.
import matplotlib.pylab as plt
from jupyterthemes import jtplot

# Apply the 'gruvboxd' jupyterthemes plotting style to subsequent figures.
jtplot.style('gruvboxd')

# Data Reading

In [3]:
# Data Reading: load both tables from ./data, indexed by the 'Id' column
target = 'SalePrice'
df_test = pd.read_csv('./data/test.csv', index_col='Id')
df_train = pd.read_csv('./data/train.csv', index_col='Id')

# The feature set is whatever columns test.csv provides
# (presumably it carries no target column -- confirm against the data)
features = df_test.columns

# Split X_train, y_train. Tuple assignment: both selections are taken from
# the full training frame before df_train is rebound to the feature subset.
y_train, df_train = df_train[target], df_train[features]

df_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Data Preparation

## Categorical Features Pipeline

In [4]:
from robusta.preprocessing.category import *
from robusta.preprocessing import *
from robusta.compose import *

# Stage 1 -- selector configured for object-dtype (categorical) columns
cats = TypeSelector(dtype='object')

# Stage 2 -- label encoding; per the original author's note the encoder
# has built-in NaN preprocessing, so no separate imputation step is used
le = LabelEncoder()

# Stage 3 -- prepend 'le_' to the resulting column names
cr = ColumnRenamer(prefix='le_')

# Pair each stage with its step name and assemble the pipeline
cat_steps = list(zip(('cats', 'le', 'cr'), (cats, le, cr)))
cat_pipe = Pipeline(cat_steps)

# Fit on the training data, then preview the transformed test set
cat_pipe.fit(df_train, y_train).transform(df_test).head()

Unnamed: 0_level_0,le_MSZoning,le_Street,le_Alley,le_LotShape,le_LandContour,le_Utilities,le_LotConfig,le_LandSlope,le_Neighborhood,le_Condition1,...,le_GarageType,le_GarageFinish,le_GarageQual,le_GarageCond,le_PavedDrive,le_PoolQC,le_Fence,le_MiscFeature,le_SaleType,le_SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,2,1,255,3,3,0,4,0,12,1,...,1,2,4,4,2,255,2,255,8,4
1462,3,1,255,0,3,0,0,0,12,2,...,1,2,4,4,2,255,255,0,8,4
1463,3,1,255,0,3,0,4,0,8,2,...,1,0,4,4,2,255,2,255,8,4
1464,3,1,255,0,3,0,4,0,8,2,...,1,0,4,4,2,255,255,255,8,4
1465,3,1,255,0,1,0,4,0,22,2,...,1,1,4,4,2,255,255,255,8,4


## Numeric Features Pipeline

In [5]:
from robusta.preprocessing.numeric import GaussRank
from robusta.preprocessing import Imputer

# Stage 1 -- selector configured for numeric-dtype columns
nums = TypeSelector(dtype=np.number)

# Stage 2 -- replace NaN entries using the 'median' strategy
imp = Imputer(strategy='median', missing_values=np.nan)

# Stage 3 -- GaussRank transformer
gr = GaussRank()

# Stage 4 -- prepend 'gr_' to the resulting column names.
# This rebinds the name `cr`; the renamer object created for the
# categorical pipeline is a separate instance and is not touched here.
cr = ColumnRenamer(prefix='gr_')

# Named steps, in execution order
num_steps = [
    ('nums', nums),
    ('imp', imp),
    ('gr', gr),
    ('cr', cr),
]
num_pipe = Pipeline(num_steps)

# Fit on the training data, then preview the transformed test set
num_pipe.fit(df_train, y_train).transform(df_test).head()

Unnamed: 0_level_0,gr_MSSubClass,gr_LotFrontage,gr_LotArea,gr_OverallQual,gr_OverallCond,gr_YearBuilt,gr_YearRemodAdd,gr_MasVnrArea,gr_BsmtFinSF1,gr_BsmtFinSF2,...,gr_GarageArea,gr_WoodDeckSF,gr_OpenPorchSF,gr_EnclosedPorch,gr_3SsnPorch,gr_ScreenPorch,gr_PoolArea,gr_MiscVal,gr_MoSold,gr_YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,-0.647676,0.561674,0.488824,-0.490783,0.405005,-0.263711,-0.533227,-0.358596,0.129493,0.910074,...,0.857042,0.313704,-0.545172,-0.153042,-0.007289,1.010849,-0.003037,-0.03038,-0.006263,4.320005
1462,-0.647676,0.63103,0.908685,0.023743,0.405005,-0.342367,-0.611973,0.348272,0.737771,-0.10916,...,-0.498901,1.321261,0.117774,-0.153042,-0.007289,-0.084633,-0.003037,2.263257,-0.006263,4.320005
1463,0.217984,0.338022,0.85831,-0.490783,-0.297887,0.349904,0.186964,-0.358596,0.556692,-0.10916,...,0.041937,0.748322,0.08341,-0.153042,-0.007289,-0.084633,-0.003037,-0.03038,-0.790484,4.320005
1464,0.217984,0.47351,0.151177,0.023743,0.405005,0.379062,0.186964,0.215906,0.300355,-0.10916,...,-0.039503,1.272316,0.117774,-0.153042,-0.007289,-0.084633,-0.003037,-0.03038,-0.006263,4.320005
1465,0.926003,-0.95803,-0.87114,0.862554,-0.297887,0.231396,-0.00427,-0.358596,-0.107931,-0.10916,...,0.166751,-0.451691,0.550091,-0.153042,-0.007289,1.068097,-0.003037,-0.03038,-1.722679,4.320005


## Union of Categorical & Numeric Pipelines

In [6]:
# Combine both sub-pipelines; in the displayed result the numeric ('gr_')
# columns are followed by the categorical ('le_') ones.
transformers = [('nums', num_pipe), ('cats', cat_pipe)]
fu = FeatureUnion(transformers)

# Fit on the training data, then preview the combined test-set features
fu.fit(df_train, y_train).transform(df_test).head()

Unnamed: 0_level_0,gr_MSSubClass,gr_LotFrontage,gr_LotArea,gr_OverallQual,gr_OverallCond,gr_YearBuilt,gr_YearRemodAdd,gr_MasVnrArea,gr_BsmtFinSF1,gr_BsmtFinSF2,...,le_GarageType,le_GarageFinish,le_GarageQual,le_GarageCond,le_PavedDrive,le_PoolQC,le_Fence,le_MiscFeature,le_SaleType,le_SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,-0.647676,0.561674,0.488824,-0.490783,0.405005,-0.263711,-0.533227,-0.358596,0.129493,0.910074,...,1,2,4,4,2,255,2,255,8,4
1462,-0.647676,0.63103,0.908685,0.023743,0.405005,-0.342367,-0.611973,0.348272,0.737771,-0.10916,...,1,2,4,4,2,255,255,0,8,4
1463,0.217984,0.338022,0.85831,-0.490783,-0.297887,0.349904,0.186964,-0.358596,0.556692,-0.10916,...,1,0,4,4,2,255,2,255,8,4
1464,0.217984,0.47351,0.151177,0.023743,0.405005,0.379062,0.186964,0.215906,0.300355,-0.10916,...,1,0,4,4,2,255,255,255,8,4
1465,0.926003,-0.95803,-0.87114,0.862554,-0.297887,0.231396,-0.00427,-0.358596,-0.107931,-0.10916,...,1,1,4,4,2,255,255,255,8,4


# Final Preprocessing Pipeline

In [8]:
from robusta.preprocessing.category import *
from robusta.preprocessing.numeric import *
from robusta.preprocessing import *
from robusta.compose import *

# Numeric branch: type selection -> median imputation -> GaussRank -> 'gr_' prefix
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    Imputer(strategy="median"),
    GaussRank(),
    ColumnRenamer(prefix='gr_'),
)

# Categorical branch: type selection -> label encoding -> 'le_' prefix
category_branch = make_pipeline(
    TypeSelector("object"),
    LabelEncoder(),
    ColumnRenamer(prefix='le_'),
)

# Full preprocessing: restrict to the feature columns, then run both
# branches and join their outputs.
prep_pipe = make_pipeline(
    ColumnSelector(columns=features),
    FeatureUnion([
        ("numeric", numeric_branch),
        ("category", category_branch),
    ]),
)

# Preview the transformed training set
prep_pipe.fit_transform(df_train).head()

Unnamed: 0_level_0,gr_MSSubClass,gr_LotFrontage,gr_LotArea,gr_OverallQual,gr_OverallCond,gr_YearBuilt,gr_YearRemodAdd,gr_MasVnrArea,gr_BsmtFinSF1,gr_BsmtFinSF2,...,le_GarageType,le_GarageFinish,le_GarageQual,le_GarageCond,le_PavedDrive,le_PoolQC,le_Fence,le_MiscFeature,le_SaleType,le_SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.233308,-0.25767,-0.273117,0.443482,-0.28864,0.5848,0.423591,0.572076,0.471615,-0.101106,...,1,1,4,4,2,255,255,255,8,4
2,-0.654032,0.551737,0.045557,-0.005474,1.248297,0.054685,-0.268858,-0.374993,0.784685,-0.101106,...,1,1,4,4,2,255,255,255,8,4
3,0.233308,-0.183035,0.412563,0.443482,-0.28864,0.494613,0.353446,0.471615,0.121388,-0.101106,...,1,1,4,4,2,255,255,255,8,4
4,0.487486,-0.460667,0.018212,0.443482,-0.28864,-1.205004,-0.396922,-0.374993,-0.181024,-0.101106,...,5,2,4,4,2,255,255,255,8,0
5,0.233308,0.660525,0.895264,0.913711,-0.28864,0.461053,0.256459,0.964415,0.371504,-0.101106,...,1,1,4,4,2,255,255,255,8,4


In [9]:
# Check equality: the step-by-step union (fu) and the single-expression
# pipeline (prep_pipe), each refit on the training data, must produce the
# same test-set features.
X_test1 = fu.fit(df_train, y_train).transform(df_test)
X_test2 = prep_pipe.fit(df_train, y_train).transform(df_test)

# Enforce the check: a mismatch now stops "Run All" with a clear message
# instead of silently displaying False.
pipelines_match = X_test1.equals(X_test2)
assert pipelines_match, "fu and prep_pipe produce different test-set features"

pipelines_match

True