# Capstone 2 Housing Prices - Pre-processing and Training Data Development <a id='pre-processing'></a>


Goal: Create a cleaned development dataset you can use to complete the
modeling step of your project.


● Create dummy or indicator features for categorical variables

● Standardize the magnitude of numeric features using a scaler

● Address outliers for numerical data (Skewness)

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

#imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.graph_objects as go

from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer



import sys
sys.path.insert(1, '/Users/swechakranthi/Desktop/Github/DataScience-Capstone2-Housing/library')
from sb_utils import save_file

In [2]:
train_data = pd.read_csv('../data/train_data-cleaned.csv')
test_data = pd.read_csv('../data/test_data-cleaned.csv')

## First, Separate independent and Dependent data

In [3]:
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice'].to_frame()

In [4]:
#set the ID value as index so that it is not used in coorelations
X.set_index('Id',inplace=True)

test_data.set_index('Id',inplace=True)


test_data.head()



Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


## Create dummies for Catagorical Data

In [5]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [6]:
#first, get 2 lists of all numeric and catagorical columns

numeric_cols = X.select_dtypes(exclude='object').columns




categ_cols = X.dtypes[X.dtypes == np.object]        # filtering by categorical variables
categ_cols = categ_cols.index.tolist()                # list of categorical fields

categ_cols_test = test_data.dtypes[test_data.dtypes == np.object]        # filtering by categorical variables
categ_cols_test = categ_cols_test.index.tolist()  

print('Are both test and train catagorical columns same? ',categ_cols == categ_cols_test)

#convert object types into catagorical dtypes
#X[categ_cols] = X[categ_cols].astype("category")
#test_data[categ_cols_test] = test_data[categ_cols_test].astype("category")


categ_cols_test
test_data.dtypes


Are both test and train catagorical columns same?  True


MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object

### We set all 

In [7]:
from sklearn.compose import make_column_selector as selector


numeric_features = ['numeric_cols']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

power_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('transformer', PowerTransformer())])

categorical_features = categ_cols
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('boxcox', power_transformer, numeric_cols),
        ('cat', categorical_transformer, categ_cols)])

In [8]:
test_data.shape

(1459, 79)

In [10]:
X_train = preprocessor.fit(X,y)
X_train

  x = um.multiply(x, x, out=x)


ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFu...
                                 ['MSZoning', 'Street', 'Alley', 'LotShape',
                                  'LandContour', 'Utilities', 'LotConfig',
                                  'LandSlope', 'Neighborhood', 'Condition1',
                                  'Condition2', 'BldgType', 'HouseStyle',
                                  'RoofStyle', 'RoofMatl', 'Exterior1st',
                                  'Exterior2nd', 'MasVnrType',

In [11]:
X_test = preprocessor.transform(test_data)

In [14]:
pd.set_option('display.max_rows', 90)

X_test

array([[-0.87256276,  0.46031974,  0.11076257, ...,  0.        ,
         1.        ,  0.        ],
       [-0.87256276,  0.50573272,  0.37584985, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.07337496,  0.18784181,  0.33205282, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.87256276,  4.0933588 ,  0.95042275, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66458604, -0.35711405, -0.00759964, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.07337496,  0.18784181, -0.08918038, ...,  0.        ,
         1.        ,  0.        ]])

## Standardize the numerical data

In [None]:

pd.set_option('display.max_rows', 90)

numeric_cols

In [None]:
scale= StandardScaler()

df = scale.fit_transform(X[numeric_cols])
df.head()

In [None]:
df_train.head()

## Numerical Data Skewness

In [None]:
#num_features = Train_Scaled.select_dtypes(exclude='object').columns
#num_features

sns.histplot(y, stat='density', linewidth=0, color = '#ff7f50', kde=True, alpha=0.3);


skew_limit = 0.5
skew_vals = X[numeric_cols].skew()

skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0:'Skew'})
             .query('abs(Skew) > {0}'.format(skew_limit)))

skew_cols

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Normalize skewed features
for col in skew_cols.index:
    X[col] = boxcox1p(X[col], boxcox_normmax(X[col] + 1))