# Capstone 2 Housing Prices - Pycaret Modeling <a id='pre-Modelling'></a>


Goal: Create a cleaned development dataset you can use to complete the
modeling step of your project.


● Create dummy or indicator features for categorical variables

● Standardize the magnitude of numeric features using a scaler

● Address outliers for numerical data (Skewness)

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

#imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.graph_objects as go
import sklearn
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer



import sys
sys.path.insert(1, '/Users/swechakranthi/Desktop/Github/DataScience-Capstone2-Housing/library')
from sb_utils import save_file

In [2]:
#https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html

def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [3]:
train_data = pd.read_csv('../data/train_data-FE.csv')
test_data = pd.read_csv('../data/test_data-FE.csv')

## First, Separate independent and Dependent data

In [4]:
#X = train_data.drop('SalePrice', axis=1)
#y = train_data['SalePrice'].to_frame()


X = train_data.drop('SalePrice', axis=1)
Y = train_data['SalePrice'].to_frame()

#X.set_index('Id',inplace=True)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3,random_state=44)

In [5]:
#set the ID value as index so that it is not used in coorelations

#test_data.set_index('Id',inplace=True)


test_data.head()



Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0,0,1,0,0,0,1,0,1,0,...,730.0,140,0,0,0,120,0,0,6,2010
1,0,0,0,1,0,0,1,0,1,0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,0,0,0,1,0,0,1,0,1,0,...,482.0,212,34,0,0,0,0,0,3,2010
3,0,0,0,1,0,0,1,0,1,0,...,470.0,360,36,0,0,0,0,0,6,2010
4,0,0,0,1,0,0,1,0,1,0,...,506.0,0,82,0,0,144,0,0,1,2010


In [6]:
X.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0,0,0,1,0,0,1,0,1,0,...,548.0,0,61,0,0,0,0,0,2,2008
1,0,0,0,1,0,0,1,0,1,0,...,460.0,298,0,0,0,0,0,0,5,2007
2,0,0,0,1,0,0,1,0,1,0,...,608.0,0,42,0,0,0,0,0,9,2008
3,0,0,0,1,0,0,1,0,1,0,...,642.0,0,35,272,0,0,0,0,2,2006
4,0,0,0,1,0,0,1,0,1,0,...,836.0,192,84,0,0,0,0,0,12,2008


In [7]:
#first, get 2 lists of all numeric and catagorical columns

numeric_cols = X.select_dtypes(exclude='object').columns




categ_cols = X.dtypes[X.dtypes == np.object]        # filtering by categorical variables
categ_cols = categ_cols.index.tolist()                # list of categorical fields

categ_cols_test = test_data.dtypes[test_data.dtypes == np.object]        # filtering by categorical variables
categ_cols_test = categ_cols_test.index.tolist()  

print('Are both test and train catagorical columns same? ',categ_cols == categ_cols_test)

#convert object types into catagorical dtypes
#X[categ_cols] = X[categ_cols].astype("category")
#test_data[categ_cols_test] = test_data[categ_cols_test].astype("category")


categ_cols_test
test_data.dtypes


Are both test and train catagorical columns same?  True


MSZoning_C (all)    int64
MSZoning_FV         int64
MSZoning_RH         int64
MSZoning_RL         int64
MSZoning_RM         int64
                    ...  
ScreenPorch         int64
PoolArea            int64
MiscVal             int64
MoSold              int64
YrSold              int64
Length: 302, dtype: object

### We set all the data transformations under a column transformer pipeline

In [8]:
from sklearn.compose import make_column_selector as selector


numeric_features = ['numeric_cols']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

power_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('transformer', PowerTransformer())])

categorical_features = categ_cols
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
      #  ('boxcox', power_transformer, numeric_cols),
        ('cat', categorical_transformer, categ_cols)])

#train_data2 = preprocessor.fit_transform(train_data)

In [9]:
#list the null value count of all columns
pd.set_option('display.max_rows', 90)
print("List of null values for each column: \n")
Missing_count = train_data.isna().sum().sort_values(ascending = False)[:25]
missing = pd.concat([Missing_count, (Missing_count/len(train_data))*100], axis=1, keys = ["count", "%"])
missing_type = pd.concat([train_data.isna().sum().sort_values(ascending = False), (train_data.isna().sum().sort_values(ascending = False)/len(train_data))*100,train_data.dtypes], axis=1, keys = ["count", "%", "types"])
print(missing_type.sort_values(by=["count"], ascending = False))
print('***********************************************************************')

print(f'Number of duplicate rows: {train_data.duplicated().sum()} ')


print(f'null row counts: {missing.astype(bool).sum(axis=0)}')

List of null values for each column: 

                  count    %    types
MSZoning_C (all)      0  0.0    int64
Utilities_NoSeWa      0  0.0    int64
MSZoning_RH           0  0.0    int64
MSZoning_RL           0  0.0    int64
MSZoning_RM           0  0.0    int64
...                 ...  ...      ...
PoolArea              0  0.0    int64
MiscVal               0  0.0    int64
MoSold                0  0.0    int64
YrSold                0  0.0    int64
SalePrice             0  0.0  float64

[303 rows x 3 columns]
***********************************************************************
Number of duplicate rows: 0 
null row counts: count    0
%        0
dtype: int64


In [10]:
# Importing module and initializing setup
from pycaret.regression import *
reg1 = setup(data = train_data, target = 'SalePrice', normalize = True, feature_selection = True)

# return best model
best = compare_models()

# return top 3 models based on 'R2'
top3 = compare_models(n_select = 3)

# return best model based on MAPE
best = compare_models(sort = 'MAPE') #default is 'R2'

# compare specific models
best_specific = compare_models(include = ['dt','rf','xgboost'])

# blacklist certain models
best_specific = compare_models(exclude = ['catboost', 'svm'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.0809,0.0141,0.1175,0.9084,0.0092,0.0068,0.296
ridge,Ridge Regression,0.0846,0.0151,0.1221,0.9012,0.0095,0.0071,0.032
omp,Orthogonal Matching Pursuit,0.0874,0.016,0.1258,0.8949,0.0098,0.0073,0.037
huber,Huber Regressor,0.0886,0.0173,0.1303,0.8863,0.0102,0.0074,0.476
gbr,Gradient Boosting Regressor,0.0922,0.018,0.1332,0.8826,0.0104,0.0077,0.492
lightgbm,Light Gradient Boosting Machine,0.0943,0.0196,0.139,0.8719,0.0108,0.0079,0.787
xgboost,Extreme Gradient Boosting,0.0978,0.0205,0.142,0.8649,0.0111,0.0082,2.236
rf,Random Forest Regressor,0.1002,0.0219,0.1468,0.8571,0.0114,0.0084,1.604
par,Passive Aggressive Regressor,0.1098,0.0235,0.1523,0.8452,0.0119,0.0092,0.069
et,Extra Trees Regressor,0.1112,0.0271,0.1637,0.8221,0.0128,0.0093,2.008


In [11]:
br_model = create_model('br')
# tune hyperparameters of decision tree
##tuned_gbr = tune_model(gbr_model)
# tune hyperparameters with increased n_iter
##tuned_gbr = tune_model(gbr_model, n_iter = 50)
# tune hyperparameters to optimize MAE
##tuned_gbr = tune_model(gbr_model, optimize = 'MAE') 

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0768,0.0185,0.136,0.8787,0.0109,0.0065
1,0.0927,0.0188,0.1372,0.9015,0.0108,0.0078
2,0.0959,0.0206,0.1436,0.8752,0.0112,0.0081
3,0.0848,0.0154,0.1243,0.9136,0.0097,0.0072
4,0.0738,0.0107,0.1035,0.9078,0.008,0.0062
5,0.0716,0.0095,0.0977,0.9265,0.0076,0.006
6,0.0793,0.0116,0.1075,0.9204,0.0083,0.0066
7,0.0808,0.0106,0.103,0.9252,0.0079,0.0067
8,0.0753,0.0109,0.1042,0.93,0.0081,0.0063
9,0.0783,0.014,0.1183,0.9049,0.0094,0.0066


In [26]:
# Baysian Ridge Parameters
parameters = {'alpha_init':[1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.9],
              'lambda_init': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-9]
                }

In [27]:
#parameters = {'learning_rate': [0.1,0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#                  'subsample'    : [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1],
#                  'n_estimators' : [100,200,300,400,500,600,700,800,900,1000],
#                  'max_depth'    : [1,2,3,4,5,6,7,8,9,10]
#                 }
#custom_grid = parameters

In [28]:
tuned_br_custom = tune_model(br_model, custom_grid = parameters)

pred_holdout = predict_model(tuned_br_custom)


Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0768,0.0185,0.136,0.8787,0.0109,0.0065
1,0.0927,0.0188,0.1372,0.9015,0.0108,0.0078
2,0.0959,0.0206,0.1436,0.8752,0.0112,0.0081
3,0.0848,0.0154,0.1243,0.9136,0.0097,0.0072
4,0.0738,0.0107,0.1035,0.9078,0.008,0.0062
5,0.0716,0.0095,0.0977,0.9265,0.0076,0.006
6,0.0793,0.0116,0.1075,0.9204,0.0083,0.0066
7,0.0808,0.0106,0.103,0.9252,0.0079,0.0067
8,0.0753,0.0109,0.1042,0.93,0.0081,0.0063
9,0.0783,0.014,0.1183,0.9049,0.0094,0.0066


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Bayesian Ridge,0.0957,0.0384,0.1958,0.7775,0.0143,0.008


In [29]:
br_model.get_params().keys()

dict_keys(['alpha_1', 'alpha_2', 'alpha_init', 'compute_score', 'copy_X', 'fit_intercept', 'lambda_1', 'lambda_2', 'lambda_init', 'n_iter', 'normalize', 'tol', 'verbose'])

In [30]:
test_data.shape

(1459, 302)

In [31]:
train_data.shape

(1460, 303)

In [32]:
br_final = finalize_model(br_model)

predictions = predict_model(br_final, data = test_data)


In [33]:
predictions

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Label
0,0,0,1,0,0,0,1,0,1,0,...,140,0,0,0,120,0,0,6,2010,11.680172
1,0,0,0,1,0,0,1,0,1,0,...,393,36,0,0,0,0,12500,6,2010,11.932524
2,0,0,0,1,0,0,1,0,1,0,...,212,34,0,0,0,0,0,3,2010,12.090724
3,0,0,0,1,0,0,1,0,1,0,...,360,36,0,0,0,0,0,6,2010,12.195065
4,0,0,0,1,0,0,1,0,1,0,...,0,82,0,0,144,0,0,1,2010,12.169735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,6,2006,11.356987
1455,0,0,0,0,1,0,1,0,1,0,...,0,24,0,0,0,0,0,4,2006,11.335861
1456,0,0,0,1,0,0,1,0,1,0,...,474,0,0,0,0,0,0,9,2006,12.017504
1457,0,0,0,1,0,0,1,0,1,0,...,80,32,0,0,0,0,700,7,2006,11.643122


In [34]:
y_pred = np.exp(predictions)

In [35]:
y_pred

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_None,Alley_Pave,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Label
0,1.0,1.0,2.718282,1.000000,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,6.327432e+60,1.000000e+00,1.0,1.0,1.304181e+52,1.0,1.000000e+00,403.428793,inf,118204.588447
1,1.0,1.0,1.000000,2.718282,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,4.761364e+170,4.311232e+15,1.0,1.0,1.000000e+00,1.0,inf,403.428793,inf,152135.088013
2,1.0,1.0,1.000000,2.718282,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,1.176062e+92,5.834617e+14,1.0,1.0,1.000000e+00,1.0,1.000000e+00,20.085537,inf,178211.081077
3,1.0,1.0,1.000000,2.718282,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,2.218265e+156,4.311232e+15,1.0,1.0,1.000000e+00,1.0,1.000000e+00,403.428793,inf,197810.621999
4,1.0,1.0,1.000000,2.718282,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,1.000000e+00,4.093997e+35,1.0,1.0,3.454661e+62,1.0,1.000000e+00,2.718282,inf,192862.857119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1.0,1.0,1.000000,1.000000,2.718282,1.0,2.718282,1.0,2.718282,1.0,...,1.000000e+00,1.000000e+00,1.0,1.0,1.000000e+00,1.0,1.000000e+00,403.428793,inf,85561.162768
1455,1.0,1.0,1.000000,1.000000,2.718282,1.0,2.718282,1.0,2.718282,1.0,...,1.000000e+00,2.648912e+10,1.0,1.0,1.000000e+00,1.0,1.000000e+00,54.598150,inf,83772.619515
1456,1.0,1.0,1.000000,2.718282,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,7.171078e+205,1.000000e+00,1.0,1.0,1.000000e+00,1.0,1.000000e+00,8103.083928,inf,165628.715817
1457,1.0,1.0,1.000000,2.718282,1.000000,1.0,2.718282,1.0,2.718282,1.0,...,5.540622e+34,7.896296e+13,1.0,1.0,1.000000e+00,1.0,1.014232e+304,1096.633158,inf,113905.276324


In [36]:
test = pd.read_csv('../data/test.csv')

In [37]:
test['Id']

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [38]:
#final = pd.DataFrame([test.ID, y_pred.Label]).transpose()
final = pd.concat([test['Id'],y_pred['Label']], axis = 1)

In [39]:
final

Unnamed: 0,Id,Label
0,1461,118204.588447
1,1462,152135.088013
2,1463,178211.081077
3,1464,197810.621999
4,1465,192862.857119
...,...,...
1454,2915,85561.162768
1455,2916,83772.619515
1456,2917,165628.715817
1457,2918,113905.276324
