In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Ask scikit-learn to display estimators as diagrams in notebook output
set_config(display='diagram')

In [None]:
# Input file locations, followed by the index / target column names
trainData, testData = 'train.csv', 'test.csv'
index_col, target_col = 'Id', 'SalePrice'

In [None]:
def cleanData(X, maxGrLivArea=4000):
    """Remove outlier rows; done outside the Pipeline because rows are dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a 'GrLivArea' column.
    maxGrLivArea : int or float, default 4000
        Exclusive upper bound; only rows with GrLivArea strictly below it
        are kept. Rows whose GrLivArea is missing fail the comparison and
        are dropped as well.

    Returns
    -------
    pandas.DataFrame
        A copy of the retained rows, so later edits never touch the input.
    """
    keep = X['GrLivArea'] < maxGrLivArea
    return X.loc[keep].copy()

In [None]:
class processHousePrices:
    """Pre-model preparation of the house-price feature frame.

    Holds a dictionary that assigns every column to a feature group
    (categorical, ordinal, count, realMult, binary, ...) and a table that
    maps ordinal string levels to integers. prepData() applies the mapping,
    adds indicator columns, stringifies the categorical columns and checks
    that the frame's columns match the feature dictionary exactly.
    """

    def __init__(self):
        self._prepared = False
        # Store each feature in a defined group
        self._features = ({
            'categorical': ([
                'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
                'LandContour', 'Utilities', 'LotConfig', 'Neighborhood',
                'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
                'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                'MasVnrType', 'Foundation', 'Heating', 'Electrical',
                'GarageType', 'GarageYrBlt', 'YearBuilt', 'MiscFeature',
                'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
                'YearRemodAdd']),
            'ordinal': ([
                'LandSlope', 'OverallQual', 'ExterQual', 'ExterCond',
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional',
                'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                'PavedDrive', 'PoolQC', 'Fence', 'OverallCond']),
            'count': ([
                'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                'TotRmsAbvGrd', 'Fireplaces', 'GarageCars']),
            'realMult': ([
                'LotFrontage', 'LotArea', 'MasVnrArea',
                'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
                'LowQualFinSF', 'GrLivArea', 'GarageArea',
                'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']),
            'binary': ([
                'CentralAir']),
            'realAdd': [],
            'binomial': []
            })
        # Define encoding of ordinal categories to numeric.
        # Alley, LotShape, Street and Utilities are listed under 'categorical'
        # above; they are still remapped here and stringified afterwards by
        # _categoriesToString(). Empty mappings leave the column untouched.
        self._ordinalEncoding = ({
            'Alley'       : {'None': 0, 'Grvl': 1, 'Pave': 2},
            'BsmtCond'    : {'No': 0, 'Po': 1, 'Fa': 2,
                             'TA': 3, 'Gd': 4, 'Ex': 5},
            'BsmtExposure': {'No': 0, 'Mn': 1, 'Av': 2, 'Gd': 3},
            'BsmtFinType1': {'No': 0, 'Unf': 1, 'LwQ': 2, 'Rec' : 3,
                             'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
            'BsmtFinType2': {'No': 0, 'Unf': 1, 'LwQ': 2, 'Rec' : 3,
                             'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
            'BsmtQual'    : {'No': 0, 'Po': 1, 'Fa': 2,
                             'TA': 3, 'Gd': 4, 'Ex': 5},
            'ExterCond'   : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
            'ExterQual'   : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
            'FireplaceQu' : {'No': 0, 'Po': 1, 'Fa': 2,
                             'TA': 3, 'Gd': 4, 'Ex': 5},
            'Functional'  : {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1' : 4,
                             'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8},
            'GarageCond'  : {'No': 0, 'Po': 1, 'Fa': 2,
                             'TA': 3, 'Gd': 4, 'Ex': 5},
            'GarageQual'  : {'No': 0, 'Po': 1, 'Fa': 2,
                             'TA': 3, 'Gd': 4, 'Ex': 5},
            'HeatingQC'   : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
            'KitchenQual' : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
            'LandSlope'   : {'Sev': 1, 'Mod': 2, 'Gtl': 3},
            'LotShape'    : {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg' : 4},
            'PavedDrive'  : {'N': 0, 'P': 1, 'Y': 2},
            'PoolQC'      : {'No': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
            'Street'      : {'Grvl': 1, 'Pave': 2},
            'Utilities'   : {'ELO': 1, 'NoSeWa': 2,
                             'NoSewr':  3, 'AllPub': 4},
            'Fence'       : {'No': 0, 'MnWw': 1, 'GdWo': 2,
                             'MnPrv': 3, 'GdPrv': 4},
            'GarageFinish': {'No': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
            'OverallQual' : {},
            'OverallCond' : {},
        })


    def _allFeatures(self):
        """ Return all values in features """
        return [ft for group in list(self._features.values()) for ft in group]


    def _assertValidFeatures(self):
        """ Assert feature dictionary matches the input """
        expected = sorted(self._allFeatures())
        actual = sorted(list(self._X.columns))
        # Report the names present on only one side to make failures actionable
        assert expected == actual, (
            'Feature dictionary and data columns differ: '
            f'{sorted(set(expected) ^ set(actual))}')


    def _remapOrdinal(self):
        """ Perform remapping of ordinal strings to numeric """
        self._X = self._X.replace(self._ordinalEncoding)


    def _updateFeatureDict(self, featureName, group):
        """ Register featureName under group.

        Registering a name that is already in the same group is a no-op, so
        prepData() can be called more than once on one instance. A name that
        is already registered in a different group raises AssertionError.
        """
        if featureName in self._features[group]:
            return
        # Compare against the flattened feature names; the dictionary's
        # values are lists, so a membership test on them never matches
        assert featureName not in self._allFeatures(), (
            f'{featureName!r} is already registered in another feature group')
        self._features[group].append(featureName)


    def _categoriesToString(self):
        """ Ensure all categories are string. Add prefix to prevent
            casting back to numeric. """
        # NOTE(review): astype(str) also stringifies missing values, so a
        # downstream imputer presumably never sees NaN here - confirm.
        for feature in self._features['categorical']:
            self._X[feature] = 'str' + self._X[feature].astype(str)


    def prepData(self, data):
        """ Run every preparation step on data and return the prepared frame.

        Steps: ordinal remap, indicator features, categorical stringification,
        then a check that the columns equal the feature dictionary. The first
        step rebinds the working frame to the result of DataFrame.replace, so
        the later column assignments are made on that new frame.
        """
        self._X = data
        self._remapOrdinal()
        self._addFeatures()
        self._categoriesToString()
        self._assertValidFeatures()
        self._prepared = True
        return self._X


    def getFeatures(self):
        """ Return feature dictionary with updated categories """
        assert self._prepared, 'Must called prepData() method first'
        return self._features


    def _addFeatures(self):
        """ Add 0/1 indicator columns and register them as 'binary' features.

        Each indicator is 1 when its source column is strictly positive, so a
        house with exactly one fireplace counts as having a fireplace. Missing
        source values compare as False and therefore yield 0.
        """
        indicators = {
            'hasPool': 'PoolArea',
            'has2ndfloor': '2ndFlrSF',
            'hasgarage': 'GarageArea',
            'hasbsmt': 'TotalBsmtSF',
            'hasfireplace': 'Fireplaces',
        }
        for flag, source in indicators.items():
            self._X[flag] = (self._X[source] > 0) * 1
            self._updateFeatureDict(flag, 'binary')
          

In [None]:
# Initialise pre-model data processing
processor = processHousePrices()

In [None]:
# Load the training data, drop outlier rows, then split off the target column.
# Column names come from the configuration cell rather than repeated literals.
X = cleanData(pd.read_csv(trainData, index_col=index_col))
y = X.pop(target_col)
X = processor.prepData(X)
features = processor.getFeatures()

In [None]:
# 80/20 train/validation split with a fixed seed; each piece is copied
split = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = [part.copy() for part in split]

In [None]:
# One preprocessing pipeline per feature group. The imputer strategies set
# here are defaults; param_grid overrides them during the grid search.
realMultTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler',  RobustScaler()),
    ('power',   PowerTransformer(method='yeo-johnson'))])
countTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler',  RobustScaler())])
# The category group holds string columns, for which SimpleImputer's default
# 'mean' strategy is invalid; 'most_frequent' keeps the pipeline usable
# outside the grid search as well.
categoryTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot' , OneHotEncoder(handle_unknown='ignore'))])
ordinalTransfomer = Pipeline(steps=[
    ('imputer', SimpleImputer())])

In [None]:
# Route each feature group through its own pipeline; the binary columns are
# one-hot encoded together with the categorical ones.
preprocessor = ColumnTransformer([
    ('realMult', realMultTransformer, features['realMult']),
    ('count', countTransformer, features['count']),
    ('category', categoryTransformer, [*features['categorical'], *features['binary']]),
    ('ordinal', ordinalTransfomer, features['ordinal']),
])

In [None]:
# XGBoost regressor trained on log1p(target); predictions are mapped back
# to the original scale with expm1 by the TransformedTargetRegressor wrapper.
regressor = XGBRegressor(random_state=0, n_jobs=12)
model = TransformedTargetRegressor(
    regressor=regressor,
    func=np.log1p,
    inverse_func=np.expm1,
)

In [None]:
# Full pipeline: preprocessing -> tree-based feature selection -> regressor.
# The target is a continuous price, so the selector ranks features with a
# regression tree (seeded for repeatable selection) instead of a classifier.
# The last step keeps the name 'classifier' because param_grid keys use it.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('feature_selection', SelectFromModel(DecisionTreeRegressor(random_state=0))),
                      ('classifier',   model)])

In [None]:
# Search space: imputation strategy for every feature group, plus the number
# of boosting rounds and the learning rate of the wrapped XGBoost regressor.
param_grid = dict(
    preprocessor__realMult__imputer__strategy=['median', 'mean'],
    preprocessor__count__imputer__strategy=['median', 'most_frequent', 'constant'],
    preprocessor__category__imputer__strategy=['constant', 'most_frequent'],
    preprocessor__ordinal__imputer__strategy=['constant', 'most_frequent'],
    classifier__regressor__n_estimators=[100, 500, 1000],
    classifier__regressor__learning_rate=[0.01, 0.05, 0.1],
)
print(f'Parameter combindations: {len(ParameterGrid(param_grid))}')

In [None]:
# Configure the cross-validation procedure: 12 shuffled folds, fixed seed
cv = KFold(n_splits=12, shuffle=True, random_state=1)

In [None]:
# Exhaustive search over param_grid using the folds defined in cv.
# refit=True retrains the best configuration so grid_search can predict.
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by
# the pipeline's own score method, not the log-RMSE printed later - confirm.
grid_search = GridSearchCV(clf, param_grid, cv=cv, refit=True, verbose=10)

In [None]:
# Fit on the training split only: X_valid is scored afterwards and must stay
# unseen, otherwise the hold-out error is measured on rows used for training.
grid_search.fit(X_train, y_train)

In [None]:
# Report the parameter combination selected by the grid search
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Score the hold-out split: RMSE between log prices.
preds = grid_search.predict(X_valid)
# Take the square root explicitly; the `squared=False` keyword of
# mean_squared_error is deprecated and absent from recent scikit-learn.
print(np.sqrt(mean_squared_error(np.log(y_valid), np.log(preds))))

output = pd.DataFrame({'Id': X_valid.index,
                       'SalePrice': preds})
# Hold-out predictions are not a submission: write them to their own file so
# re-running this cell can never overwrite the real submission.csv.
output.to_csv('validation_predictions.csv', index=False)

In [None]:
# Display the most recently built predictions frame
output

In [None]:
# Predict the competition test set with the refitted best pipeline.
X_test = pd.read_csv(testData, index_col=index_col)
# Initialise pre-model data processing with a fresh processor, so its
# feature dictionary starts from the initial state
processor = processHousePrices()
X_test = processor.prepData(X_test)
preds = grid_search.predict(X_test)
# Column names come from the configuration cell ('Id' / 'SalePrice')
output = pd.DataFrame({index_col: X_test.index,
                       target_col: preds})
output.to_csv('submission.csv', index=False)

In [None]:
    # Disabled block: the `if False:` guard means none of these fills ever run.
    # It records a hand-picked replacement for missing values, column by column.
    # The 'No' / 'None' fill strings match keys in the ordinal encoding table
    # of processHousePrices.
    if False:
        # Alley : filled with the string 'None'
        X['Alley'] = X['Alley'].fillna('None')
        # BedroomAbvGr : NA most likely means 0
        X['BedroomAbvGr'] = X['BedroomAbvGr'].fillna(0)
        # BsmtQual etc : data description says NA for basement features is 'no basement'
        X['BsmtQual'] = X['BsmtQual'].fillna('No')
        X['BsmtCond'] = X['BsmtCond'].fillna('No')
        X['BsmtExposure'] = X['BsmtExposure'].fillna('No')
        X['BsmtFinType1'] = X['BsmtFinType1'].fillna('No')
        X['BsmtFinType2'] = X['BsmtFinType2'].fillna('No')
        X['BsmtFullBath'] = X['BsmtFullBath'].fillna(0)
        X['BsmtHalfBath'] = X['BsmtHalfBath'].fillna(0)
        X['BsmtUnfSF'] = X['BsmtUnfSF'].fillna(0)
        # CentralAir : NA most likely means No
        X['CentralAir'] = X['CentralAir'].fillna('N')
        # Condition : NA most likely means Normal
        X['Condition1'] = X['Condition1'].fillna('Norm')
        X['Condition2'] = X['Condition2'].fillna('Norm')
        # EnclosedPorch : NA most likely means no enclosed porch
        X['EnclosedPorch'] = X['EnclosedPorch'].fillna(0)
        # External stuff : NA most likely means average
        X['ExterCond'] = X['ExterCond'].fillna('TA')
        X['ExterQual'] = X['ExterQual'].fillna('TA')
        # Fence : data description says NA means 'no fence'
        X['Fence'] = X['Fence'].fillna('No')
        # FireplaceQu : data description says NA means 'no fireplace'
        X['FireplaceQu'] = X['FireplaceQu'].fillna('No')
        X['Fireplaces'] = X['Fireplaces'].fillna(0)
        # Functional : data description says NA means typical
        X['Functional'] = X['Functional'].fillna('Typ')
        # GarageType etc : data description says NA for garage features is 'no garage'
        X['GarageType'] = X['GarageType'].fillna('No')
        X['GarageFinish'] = X['GarageFinish'].fillna('No')
        X['GarageQual'] = X['GarageQual'].fillna('No')
        X['GarageCond'] = X['GarageCond'].fillna('No')
        X['GarageArea'] = X['GarageArea'].fillna(0)
        X['GarageCars'] = X['GarageCars'].fillna(0)
        # HalfBath : NA most likely means no half baths above grade
        X['HalfBath'] = X['HalfBath'].fillna(0)
        # HeatingQC : NA most likely means typical
        X['HeatingQC'] = X['HeatingQC'].fillna('TA')
        # KitchenAbvGr : NA most likely means 0
        X['KitchenAbvGr'] = X['KitchenAbvGr'].fillna(0)
        # KitchenQual : NA most likely means typical
        X['KitchenQual'] = X['KitchenQual'].fillna('TA')
        # LotFrontage : NA most likely means no lot frontage
        X['LotFrontage'] = X['LotFrontage'].fillna(0)
        # LotShape : NA most likely means regular
        X['LotShape'] = X['LotShape'].fillna('Reg')
        # MasVnrType : NA most likely means no veneer
        X['MasVnrType'] = X['MasVnrType'].fillna('None')
        X['MasVnrArea'] = X['MasVnrArea'].fillna(0)
        # MiscFeature : data description says NA means 'no misc feature'
        X['MiscFeature'] = X['MiscFeature'].fillna('No')
        X['MiscVal'] = X['MiscVal'].fillna(0)
        # OpenPorchSF : NA most likely means no open porch
        X['OpenPorchSF'] = X['OpenPorchSF'].fillna(0)
        # PavedDrive : NA most likely means not paved
        X['PavedDrive'] = X['PavedDrive'].fillna('N')
        # PoolQC : data description says NA means 'no pool'
        X['PoolQC'] = X['PoolQC'].fillna('No')
        X['PoolArea'] = X['PoolArea'].fillna(0)
        # SaleCondition : NA most likely means normal sale
        X['SaleCondition'] = X['SaleCondition'].fillna('Normal')
        # ScreenPorch : NA most likely means no screen porch
        X['ScreenPorch'] = X['ScreenPorch'].fillna(0)
        # TotRmsAbvGrd : NA most likely means 0
        X['TotRmsAbvGrd'] = X['TotRmsAbvGrd'].fillna(0)
        # Utilities : NA most likely means all public utilities
        X['Utilities'] = X['Utilities'].fillna('AllPub')
        # WoodDeckSF : NA most likely means no wood deck
        X['WoodDeckSF'] = X['WoodDeckSF'].fillna(0)

In [None]:
# Show the refitted best pipeline (the dangling `grid_search.` was a syntax error)
grid_search.best_estimator_