In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

import joblib

from feature_engine.imputation import AddMissingIndicator,MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder,OrdinalEncoder
from feature_engine.creation import RelativeFeatures
from feature_engine.transformation import LogTransformer, YeoJohnsonTransformer
from feature_engine.wrappers import SklearnTransformerWrapper
from preprocessors import Mapper

In [21]:
# Load the raw training data; 'Id' is dropped before modelling.
data = pd.read_csv('../data/train.csv').drop(columns='Id')

# Prepare features and target once, *before* splitting, instead of repeating
# the same transformation on each split. Assigning columns on the frames
# returned by train_test_split can trigger pandas' SettingWithCopyWarning.
#   * MSSubClass is listed in RARE_LABELS and goes through the categorical
#     encoders, so it is cast to object dtype.
#   * The target is modelled on the log scale; predictions are mapped back
#     with np.exp when the error is computed.
features = data.drop(columns='SalePrice').astype({'MSSubClass': 'O'})
target = np.log(data['SalePrice'])

# 90/10 split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=0,
)

In [22]:
# ---------------------------------------------------------------------------
# Feature groups consumed by the preprocessing pipeline.
# ---------------------------------------------------------------------------

# Categorical columns imputed with CategoricalImputer(imputation_method='missing').
MISSING = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

# Categorical columns imputed with CategoricalImputer(imputation_method='frequent').
FREQUENT = [
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
]

# Numerical columns that get a missing-indicator (AddMissingIndicator) and are
# then imputed (MeanMedianImputer). Both lists hold the same three columns.
MISSING_INDICATOR = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
MEAN = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Year columns combined by RelativeFeatures(func=['sub']).
TEMPORAL_VARIABLE = ['YrSold']
TEMPORAL_REFERENCE = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

# Columns passed to the numerical transformers.
LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]
YEO_JOHNSON_VAR = ['LotArea']

# Columns binarised with Binarizer(threshold=0).
SKEWED = [
    'BsmtFinSF2',
    'LowQualFinSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'MiscVal',
]

# ---------------------------------------------------------------------------
# Label -> integer mappings applied by the Mapper steps, with their columns.
# ---------------------------------------------------------------------------
QUAL_MAPPINGS = {
    'Po': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5,
    'Missing': 0,
    'NA': 0,
}
QUAL_VARS = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
]

EXPOSURE_MAPPINGS = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
EXPOSURE_VAR = ['BsmtExposure']

FINISH_MAPPINGS = {
    'Missing': 0,
    'NA': 0,
    'Unf': 1,
    'LwQ': 2,
    'Rec': 3,
    'BLQ': 4,
    'ALQ': 5,
    'GLQ': 6,
}
FINISH_VARS = ['BsmtFinType1', 'BsmtFinType2']

GARAGE_MAPPINGS = {'Missing': 0, 'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
GARAGE_VAR = ['GarageFinish']

FENCE_MAPPINGS = {
    'Missing': 0,
    'NA': 0,
    'MnWw': 1,
    'GdWo': 2,
    'MnPrv': 3,
    'GdPrv': 4,
}
FENCE_VAR = ['Fence']

# Categorical columns handled by RareLabelEncoder and then OrdinalEncoder.
RARE_LABELS = [
    'MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'Heating',
    'CentralAir',
    'Electrical',
    'Functional',
    'GarageType',
    'PavedDrive',
    'PoolQC',
    'MiscFeature',
    'SaleType',
    'SaleCondition',
    'MSSubClass',
]


In [23]:
# End-to-end pipeline: imputation -> feature creation -> numerical transforms
# -> categorical mappings/encodings -> scaling -> random-forest regressor.
# Step names are runtime keys (pipe.named_steps[...]) and are kept unchanged.
pipe = Pipeline(
    steps=[
        # --- missing data -------------------------------------------------
        ('missing_imputer',
         CategoricalImputer(imputation_method='missing', variables=MISSING)),
        ('frequent_inputer',
         CategoricalImputer(imputation_method='frequent', variables=FREQUENT)),
        # The indicator must run before the numerical imputer fills the NaNs
        # it is meant to flag.
        ('missing_indicator',
         AddMissingIndicator(variables=MISSING_INDICATOR)),
        # NOTE(review): the step is named 'mean_imputer' and fed by MEAN, but
        # it was created without imputation_method, and feature_engine's
        # default is 'median'. The method is spelled out here so the actual
        # behaviour is visible; switch to 'mean' if that was the intent
        # (doing so changes the fitted model and the reported metrics).
        ('mean_imputer',
         MeanMedianImputer(imputation_method='median', variables=MEAN)),

        # --- feature creation ---------------------------------------------
        # Combines YrSold with each reference year column using 'sub'.
        ('temporal_feature',
         RelativeFeatures(
             variables=TEMPORAL_VARIABLE,
             reference=TEMPORAL_REFERENCE,
             func=['sub'],
         )),

        # --- numerical transformations ------------------------------------
        ('log_transformer', LogTransformer(variables=LOG_VARS)),
        ('yeo_johnson_transformer',
         YeoJohnsonTransformer(variables=YEO_JOHNSON_VAR)),
        # Binarizer(threshold=0) turns each SKEWED column into a
        # "greater than zero" flag.
        ('binarizer',
         SklearnTransformerWrapper(
             transformer=Binarizer(threshold=0),
             variables=SKEWED,
         )),

        # --- label -> integer mappings (project-local Mapper) --------------
        ('qual_mapper',
         Mapper(variables=QUAL_VARS, mappings=QUAL_MAPPINGS)),
        ('exposure_mapper',
         Mapper(variables=EXPOSURE_VAR, mappings=EXPOSURE_MAPPINGS)),
        ('finish_mapper',
         Mapper(variables=FINISH_VARS, mappings=FINISH_MAPPINGS)),
        ('garage_mapper',
         Mapper(variables=GARAGE_VAR, mappings=GARAGE_MAPPINGS)),
        ('fence_mapper',
         Mapper(variables=FENCE_VAR, mappings=FENCE_MAPPINGS)),

        # --- categorical encoding -----------------------------------------
        # Rare categories are grouped first, then the same columns are
        # ordinal-encoded with the 'ordered' method.
        ('rare_encoder',
         RareLabelEncoder(tol=0.01, n_categories=1, variables=RARE_LABELS)),
        ('categorical_encoder',
         OrdinalEncoder(encoding_method='ordered', variables=RARE_LABELS)),

        # --- scaling + model ----------------------------------------------
        ('scaler', MinMaxScaler()),
        ('rf', RandomForestRegressor(random_state=0)),
    ]
)
                


In [24]:
# Fit every preprocessing step and the regressor on the training split.
pipe.fit(X_train, y_train)

# The target was log-transformed, so both truth and predictions are mapped
# back with exp before measuring the error. mean_absolute_error is documented
# as (y_true, y_pred); the arguments are passed in that order (the value is
# the same either way because MAE is symmetric).
train_pred = np.exp(pipe.predict(X_train))
print(mean_absolute_error(np.exp(y_train), train_pred))

6525.135540300742


In [25]:
# Held-out evaluation: predictions come back on the log scale, so they are
# exponentiated, as is the log-transformed y_test, before computing the error.
pred = np.exp(pipe.predict(X_test))

# Arguments follow the documented (y_true, y_pred) order; MAE is symmetric,
# so the printed value is unchanged.
print(mean_absolute_error(np.exp(y_test), pred))

16698.410369944668
