In [1]:
import os
import platform
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Check which platform is running the notebook
if platform.system() == 'Windows':
    PROJECT_PATH = "\\".join(os.getcwd().split('\\')[:-1])
else:
    # Assuming a Unix based platform
    PROJECT_PATH = "/".join(os.getcwd().split('/')[:-1])

DATA_PATH = os.path.join(PROJECT_PATH, 'data')
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'train.csv')

train_data = pd.read_csv(TRAIN_DATA_PATH)
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Bringing in the columns from the previous notebook

In [2]:
numeric_cols = [
    'MSSubClass',
    'LotFrontage',
    'LotArea',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'MiscVal',
    'MoSold',
    'YrSold'
]

ordinal_cols_pt1 = [
    'BsmtQual',
    'BsmtExposure',
    'BsmtFinType1',
    'FireplaceQu'
]

ordinal_cols_pt2 = [
    'LotShape',
    'ExterQual',
    'BsmtQual',
    'BsmtExposure',
    'BsmtFinType1',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu'
]

categorical_cols = [
    'MSZoning',
    'LandContour',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'GarageType',
    'GarageFinish'
]

In [3]:
from sklearn.base import TransformerMixin

class CreateNewFeatures(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['BeenRemodelled'] = (X.YearBuilt == X.YearRemodAdd)
        X['HasGarage'] = X.GarageType.isna().astype(int)
        X['OverallRating'] = (X.OverallCond + X.OverallQual) / 2
        return X
    
    @staticmethod
    def get_features():
        return ['BeenRemodelled', 'HasGarage', 'OverallRating']

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

ordinal_pipeline_pt1 = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OrdinalEncoder())
])

ordinal_pipeline_pt2 = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='None')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])


numeric_pipeline = Pipeline([
    ('gen', CreateNewFeatures()),
    ('garage_prep', ColumnTransformer([
        ('garage_fill', SimpleImputer(strategy='constant', fill_value=0), ['GarageYrBlt']),
        ('numeric_fill', SimpleImputer(strategy='median'), numeric_cols)
    ])),
    ('scaler', MinMaxScaler())
])

In [5]:
from sklearn.pipeline import FeatureUnion

processing_pipeline = FeatureUnion([
    ('numeric', numeric_pipeline),
    ('processing', ColumnTransformer([
        ('ordinal_pt1', ordinal_pipeline_pt1, ordinal_cols_pt1),
        ('ordinal_pt2', ordinal_pipeline_pt2, ordinal_cols_pt2),
        ('categorical', categorical_pipeline, categorical_cols)
    ])),
])

In [6]:
target_col = 'SalePrice'

X = train_data.drop(target_col, axis=1)
y = train_data[target_col]

In [7]:
from sklearn.model_selection import train_test_split

random_state = 42
test_size = 0.3

# Split into the training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

print('Training size: ', X_train.shape[0])
print('Test size: ', X_test.shape[0])

Training size:  1022
Test size:  438


In [8]:
X_train_processed = processing_pipeline.fit_transform(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### Grid Search on Extra Trees

In [9]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

extra_tree_param_grid = [
    {
        'n_estimators': [100, 150, 200, 250, 300, 350, 400],
        'criterion': ['mse', 'mae'],
    }
]

extra_reg = ExtraTreesRegressor(random_state=random_state)
extra_tree_grid_search = GridSearchCV(extra_reg,
                                      extra_tree_param_grid,
                                      cv=3,
                                      n_jobs=4,
                                      verbose=2,
                                      scoring='neg_mean_squared_log_error')
extra_tree_grid_search.fit(X_train_processed, y_train)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  4.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                           criterion='mse', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           max_samples=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=100, n_jobs=None,
                                           oob_score=False, random_state=42,
                                           verbose=0, warm_start=False),
             iid='deprecated', n_jobs=4,
             param_gr

In [10]:
extra_tree_grid_search.best_params_

{'criterion': 'mse', 'n_estimators': 300}

In [11]:
np.sqrt(-extra_tree_grid_search.best_score_)

0.16523463477065678

In [12]:
best_extra_tree = extra_tree_grid_search.best_estimator_

### Grid Search on SVR

In [13]:
from sklearn.svm import SVR

svr_param_grid = [
    {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'degree': [3, 6, 9, 12],
        'C': [1, 10, 100, 1000],
        'gamma': [1e-3, 1e-4, 1e-5, 1e-6],
    }
]

svr = SVR()
svr_grid_search = GridSearchCV(svr,
                               svr_param_grid,
                               cv=3,
                               n_jobs=4,
                               verbose=2,
                               scoring='neg_mean_squared_log_error')
svr_grid_search.fit(X_train_processed, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  58 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 300 tasks      | elapsed:   11.2s
[Parallel(n_jobs=4)]: Done 576 out of 576 | elapsed:   21.2s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=4,
             param_grid=[{'C': [1, 10, 100, 1000], 'degree': [3, 6, 9, 12],
                          'gamma': [0.001, 0.0001, 1e-05, 1e-06],
                          'kernel': ['rbf', 'poly', 'sigmoid']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_log_error', verbose=2)

In [14]:
svr_grid_search.best_params_

{'C': 1000, 'degree': 3, 'gamma': 0.001, 'kernel': 'rbf'}

In [15]:
np.sqrt(-svr_grid_search.best_score_)

0.3774384902227543

### Creating the full stack

In [16]:
from sklearn.ensemble import RandomForestRegressor

rand_forest_params = {
    'bootstrap': True,
    'ccp_alpha': 0.0,
    'criterion': 'mse',
    'max_depth': None,
    'max_features': 'auto',
    'max_leaf_nodes': None,
    'max_samples': None,
    'min_impurity_decrease': 0.0,
    'min_impurity_split': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 400,
    'n_jobs': None,
    'oob_score': False,
    'random_state': 42,
    'verbose': 0,
    'warm_start': True
}

In [17]:
extra_trees_params = {
    'criterion': 'mse',
    'n_estimators': 350,
    'random_state': 42
}

svr_params = {
    'C': 1000,
    'degree': 3,
    'gamma': 0.001,
    'kernel': 'rbf'
}

In [18]:
class TargetTransformers(object):
    @staticmethod
    def log(val):
        return np.log1p(val)
    
    @staticmethod
    def inverse_log(val):
        return np.exp(val) - 1

In [19]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

regressor_stack = StackingRegressor([
    ('rand_forest', RandomForestRegressor(**rand_forest_params)),
    ('extra_trees', ExtraTreesRegressor(**extra_trees_params)),
    ('svr', SVR(**svr_params))
])

ml_model = TransformedTargetRegressor(regressor=regressor_stack,
                                      func=TargetTransformers.log,
                                      inverse_func=TargetTransformers.inverse_log)

### Applying cross validation to the stack

In [20]:
from utils.core import *

scores = cross_val_regression(ml_model, X_train_processed, y_train)
print('Best Params Stacking RMSE: {:.6f}'.format(scores['rmse']))
print('Best Params Stacking RMSLE: {:.6f}'.format(scores['rmlse']))

Best Params Stacking RMSE: 29288.665056
Best Params Stacking RMSLE: 0.149463


In [21]:
# Create the full pipelines
full_ml_pipeline = Pipeline([
    ('process', processing_pipeline),
    ('ml', ml_model)
])

In [22]:
# Training the full pipeline
full_ml_pipeline.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Pipeline(memory=None,
         steps=[('process',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('numeric',
                                                 Pipeline(memory=None,
                                                          steps=[('gen',
                                                                  <__main__.CreateNewFeatures object at 0x00000196CA4A26A0>),
                                                                 ('garage_prep',
                                                                  ColumnTransformer(n_jobs=None,
                                                                                    remainder='drop',
                                                                                    sparse_threshold=0.3,
                                                                                    transformer_weights=None,
                                                                                 

In [24]:
# Run against the training set
y_pred = full_ml_pipeline.predict(X_train)
rmsle = root_mean_log_error(y_train, y_pred)
print('RMSLE: {:.6f}'.format(rmsle))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


RMSLE: 0.038045


In [25]:
# Run against the test set
y_pred = full_ml_pipeline.predict(X_test)
rmsle = root_mean_log_error(y_test, y_pred)
print('Test RMSLE: {:.6f}'.format(rmsle))

Test RMSLE: 0.138165


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Compared to the previous model, this one slightly under performs to previous attempts when executed on the test set. Still worth making a submission to see how it performs

## Creating a submission with the new stack model

In [26]:
TEST_DATA = os.path.join(DATA_PATH, 'test.csv')
test_data = pd.read_csv(TEST_DATA)

In [28]:
submission_pred = full_ml_pipeline.predict(test_data)

In [29]:
ids = test_data['Id']

submission_data = {
    'Id': ids,
    'SalePrice': submission_pred
}

submission_file = os.path.join(DATA_PATH, 'submission_v4.csv')
submussion_df = pd.DataFrame.from_dict(submission_data)
# submussion_df.to_csv(submission_file, index=False)

As expected, this submission didn't improve the results even after applying grid search.

## Trying a Gradient Boosting enemble