# Capstone 2 Housing Prices - Pre-processing and Training Data Development <a id='pre-processing'></a>


Goal: Create a cleaned development dataset you can use to complete the
modeling step of your project.


● Create dummy or indicator features for categorical variables

● Standardize the magnitude of numeric features using a scaler

● Address outliers for numerical data (Skewness)

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

#imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.graph_objects as go
import sklearn
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer



import sys
sys.path.insert(1, '/Users/swechakranthi/Desktop/Github/DataScience-Capstone2-Housing/library')
from sb_utils import save_file

In [2]:
#https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html

def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [3]:
train_data = pd.read_csv('../data/train_data-cleaned.csv')
test_data = pd.read_csv('../data/test_data-cleaned.csv')

## First, Separate independent and Dependent data

In [4]:
#X = train_data.drop('SalePrice', axis=1)
#y = train_data['SalePrice'].to_frame()


X = train_data.drop('SalePrice', axis=1)
Y = train_data['SalePrice'].to_frame()

X.set_index('Id',inplace=True)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3,random_state=44)

In [5]:
#set the ID value as index so that it is not used in coorelations

test_data.set_index('Id',inplace=True)


test_data.head()



Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [6]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [7]:
#first, get 2 lists of all numeric and catagorical columns

numeric_cols = X.select_dtypes(exclude='object').columns




categ_cols = X.dtypes[X.dtypes == np.object]        # filtering by categorical variables
categ_cols = categ_cols.index.tolist()                # list of categorical fields

categ_cols_test = test_data.dtypes[test_data.dtypes == np.object]        # filtering by categorical variables
categ_cols_test = categ_cols_test.index.tolist()  

print('Are both test and train catagorical columns same? ',categ_cols == categ_cols_test)

#convert object types into catagorical dtypes
#X[categ_cols] = X[categ_cols].astype("category")
#test_data[categ_cols_test] = test_data[categ_cols_test].astype("category")


categ_cols_test
test_data.dtypes


Are both test and train catagorical columns same?  True


MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object

### We set all the data transformations under a column transformer pipeline

In [8]:
from sklearn.compose import make_column_selector as selector


numeric_features = ['numeric_cols']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

power_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('transformer', PowerTransformer())])

categorical_features = categ_cols
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
      #  ('boxcox', power_transformer, numeric_cols),
        ('cat', categorical_transformer, categ_cols)])

In [9]:
#

In [10]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', LogisticRegression(solver='lbfgs'))]) 


clf.fit(X_train,Y_train)  
Y_pred = clf.predict(X_test)
clf.score(X_test,Y_test)





0.0045662100456621

In [11]:
from sklearn.ensemble import RandomForestRegressor

forest = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', RandomForestRegressor(n_estimators =40, random_state = 0))]) 
forest.fit(X_train,Y_train)  
Y_pred = forest.predict(X_test)
forest.score(X_test,Y_test)

0.8105533230208257

In [12]:
import xgboost as xg

xgb = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', xg.XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 50, seed = 123))]) 
xgb.fit(X_train,Y_train)  
Y_pred = xgb.predict(X_test)
xgb.score(X_test,Y_test)



0.7777617185273393

In [13]:
from sklearn.linear_model import RidgeCV,Lasso,ElasticNet

ridge = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', RidgeCV(alphas=[0.1, 1.0, 10.0]))]) 
ridge.fit(X_train,Y_train)  
Y_pred = ridge.predict(X_test)
ridge.score(X_test,Y_test)



0.8465079669202853

In [14]:
from sklearn.metrics import r2_score,mean_squared_error

print("R2 score",r2_score(Y_test,Y_pred))
print("RMSE",np.sqrt(mean_squared_error(Y_test,Y_pred)))

R2 score 0.8465079669202853
RMSE 29562.904472606468


In [15]:

model_lasso = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', Lasso(alpha=0.1))]) 
model_lasso.fit(X_train, Y_train) 
pred_test_lasso= model_lasso.predict(X_test)

print("R2 score",r2_score(Y_test,pred_test_lasso))
print("RMSE",np.sqrt(mean_squared_error(Y_test,pred_test_lasso)))

R2 score 0.801807706674696
RMSE 33592.90503689818


In [16]:
X_test.shape

(438, 79)

In [17]:
model_enet = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', ElasticNet(alpha = 0.1))]) 
model_enet.fit(X_train, Y_train) 
pred_test_enet= model_enet.predict(X_test)

print("R2 score",r2_score(Y_test,pred_test_enet))
print("RMSE",np.sqrt(mean_squared_error(Y_test,pred_test_enet)))


R2 score 0.8481915570431683
RMSE 29400.325841592945


In [18]:
pred_test_lasso
Y_test

Unnamed: 0,SalePrice
1140,139000
132,150750
642,345000
952,133900
1297,140000
...,...
523,184750
1004,181000
85,260000
1414,207000


In [19]:
y_test = clf.predict(test_data)

In [20]:
test_data.shape

(1459, 79)

In [21]:
clf.score(test_data, y_test)

1.0

In [22]:
X_test = preprocessor.transform(test_data)

In [23]:
get_feature_names(preprocessor)


['num__MSSubClass',
 'num__LotFrontage',
 'num__LotArea',
 'num__OverallQual',
 'num__OverallCond',
 'num__YearBuilt',
 'num__YearRemodAdd',
 'num__MasVnrArea',
 'num__BsmtFinSF1',
 'num__BsmtFinSF2',
 'num__BsmtUnfSF',
 'num__TotalBsmtSF',
 'num__1stFlrSF',
 'num__2ndFlrSF',
 'num__LowQualFinSF',
 'num__GrLivArea',
 'num__BsmtFullBath',
 'num__BsmtHalfBath',
 'num__FullBath',
 'num__HalfBath',
 'num__BedroomAbvGr',
 'num__KitchenAbvGr',
 'num__TotRmsAbvGrd',
 'num__Fireplaces',
 'num__GarageYrBlt',
 'num__GarageCars',
 'num__GarageArea',
 'num__WoodDeckSF',
 'num__OpenPorchSF',
 'num__EnclosedPorch',
 'num__3SsnPorch',
 'num__ScreenPorch',
 'num__PoolArea',
 'num__MiscVal',
 'num__MoSold',
 'num__YrSold',
 'onehot__x0_C (all)',
 'onehot__x0_FV',
 'onehot__x0_RH',
 'onehot__x0_RL',
 'onehot__x0_RM',
 'onehot__x1_Grvl',
 'onehot__x1_Pave',
 'onehot__x2_Grvl',
 'onehot__x2_None',
 'onehot__x2_Pave',
 'onehot__x3_IR1',
 'onehot__x3_IR2',
 'onehot__x3_IR3',
 'onehot__x3_Reg',
 'onehot__x4_

In [24]:
X_test

<1459x297 sparse matrix of type '<class 'numpy.float64'>'
	with 115255 stored elements in Compressed Sparse Row format>