# Ames Housing Saleprice

## Problem Statement

Build a regression model that predicts the sale price of a house.

## Executive Summary

### Contents:
- [6. Pre-Processing](#6.-Pre-Processing)


Links:
[Kaggle challenge link](https://www.kaggle.com/c/dsi-us-6-project-2-regression-challenge/data)

## 6. Pre-Processing

In [45]:
#Imports:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
plt.style.use('ggplot')

In [2]:
# Load the cleaned training set for pre-processing.
# na_filter=False turns off pandas' missing-value detection, so the literal
# string "NA" stays a regular category instead of becoming NaN.
df_train_pp = pd.read_csv(
    "./datasets/train_EDA_sorted.csv",
    na_filter=False,
)

In [3]:
# Preview the first rows of the loaded frame.
df_train_pp.head()

Unnamed: 0,MS Zoning,Street,Alley,Neighborhood,Condition 1,Bldg Type,House Style,Overall Qual,Year Built,Year Remod/Add,...,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Sale Type,SalePrice
0,RL,Pave,,Sawyer,RRAe,1Fam,2Story,6,1976,2005,...,RFn,2.0,475.0,TA,TA,Y,0,44,WD,130500
1,RL,Pave,,SawyerW,Norm,1Fam,2Story,7,1996,1997,...,RFn,2.0,559.0,TA,TA,Y,0,74,WD,220000
2,RL,Pave,,NAmes,Norm,1Fam,1Story,5,1953,2007,...,Unf,1.0,246.0,TA,TA,Y,0,52,WD,109000
3,RL,Pave,,Timber,Norm,1Fam,2Story,5,2006,2007,...,Fin,2.0,400.0,TA,TA,Y,100,0,WD,174000
4,RL,Pave,,SawyerW,Norm,1Fam,1.5Fin,6,1900,1993,...,Unf,2.0,484.0,TA,TA,N,0,59,WD,138500


In [4]:
# List every column with its non-null count and dtype before any encoding.
df_train_pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 49 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MS Zoning       2051 non-null   object 
 1   Street          2051 non-null   object 
 2   Alley           2051 non-null   object 
 3   Neighborhood    2051 non-null   object 
 4   Condition 1     2051 non-null   object 
 5   Bldg Type       2051 non-null   object 
 6   House Style     2051 non-null   object 
 7   Overall Qual    2051 non-null   int64  
 8   Year Built      2051 non-null   int64  
 9   Year Remod/Add  2051 non-null   int64  
 10  Roof Style      2051 non-null   object 
 11  Exterior 1st    2051 non-null   object 
 12  Exterior 2nd    2051 non-null   object 
 13  Mas Vnr Area    2051 non-null   float64
 14  Exter Qual      2051 non-null   object 
 15  Exter Cond      2051 non-null   object 
 16  Foundation      2051 non-null   object 
 17  Bsmt Qual       2051 non-null   o

# 6.1 Changing Categories to Ordinal data
- Some categorical columns are really ordinal: their levels have a natural order (e.g. quality ratings).
- Converting them to integer codes keeps that order, so the models can make use of it.

In [5]:
# Check which 'Exter Qual' levels occur, and how often, before encoding.
df_train_pp['Exter Qual'].value_counts()

TA    1247
Gd     697
Ex      81
Fa      26
Name: Exter Qual, dtype: int64

In [6]:
# Encode 'Exter Qual' on an ordinal scale (Ex=5 ... Po=1).
# The cast is assigned back to the column: relying on replace() to downcast an
# object column to int is deprecated in recent pandas, and an unassigned
# astype() only reports the dtype of a temporary copy.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Exter Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
df_train_pp['Exter Qual'] = (
    df_train_pp['Exter Qual'].replace(labels['Exter Qual']).astype('int64')
)
df_train_pp['Exter Qual'].dtype

dtype('int64')

In [7]:
# Check which 'Exter Cond' levels occur, and how often, before encoding.
df_train_pp['Exter Cond'].value_counts()

TA    1778
Gd     215
Fa      49
Ex       7
Po       2
Name: Exter Cond, dtype: int64

In [8]:
# Encode 'Exter Cond' on an ordinal scale (Ex=5 ... Po=1).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Exter Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
df_train_pp['Exter Cond'] = (
    df_train_pp['Exter Cond'].replace(labels['Exter Cond']).astype('int64')
)
df_train_pp['Exter Cond'].dtype

dtype('int64')

In [9]:
# Check which 'Bsmt Qual' levels occur (including the literal 'NA') before encoding.
df_train_pp['Bsmt Qual'].value_counts()

TA    887
Gd    864
Ex    184
Fa     60
NA     55
Po      1
Name: Bsmt Qual, dtype: int64

In [10]:
# Encode 'Bsmt Qual' on an ordinal scale (Ex=5 ... Po=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Bsmt Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
df_train_pp['Bsmt Qual'] = (
    df_train_pp['Bsmt Qual'].replace(labels['Bsmt Qual']).astype('int64')
)
df_train_pp['Bsmt Qual'].dtype

dtype('int64')

In [11]:
# Check which 'Bsmt Cond' levels occur (including the literal 'NA') before encoding.
df_train_pp['Bsmt Cond'].value_counts()

TA    1834
Gd      89
Fa      65
NA      55
Po       5
Ex       3
Name: Bsmt Cond, dtype: int64

In [12]:
# Encode 'Bsmt Cond' on an ordinal scale (Ex=5 ... Po=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Bsmt Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
df_train_pp['Bsmt Cond'] = (
    df_train_pp['Bsmt Cond'].replace(labels['Bsmt Cond']).astype('int64')
)
df_train_pp['Bsmt Cond'].dtype

dtype('int64')

In [13]:
# Check which 'Bsmt Exposure' levels occur (including the literal 'NA') before encoding.
df_train_pp['Bsmt Exposure'].value_counts()

No    1339
Av     288
Gd     203
Mn     163
NA      58
Name: Bsmt Exposure, dtype: int64

In [14]:
# Encode 'Bsmt Exposure' on an ordinal scale (Gd=4, Av=3, Mn=2, No=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Bsmt Exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0}}
df_train_pp['Bsmt Exposure'] = (
    df_train_pp['Bsmt Exposure'].replace(labels['Bsmt Exposure']).astype('int64')
)
df_train_pp['Bsmt Exposure'].dtype

dtype('int64')

In [15]:
# Check which 'BsmtFin Type 1' levels occur (including the literal 'NA') before encoding.
df_train_pp['BsmtFin Type 1'].value_counts()

GLQ    615
Unf    603
ALQ    293
BLQ    200
Rec    183
LwQ    102
NA      55
Name: BsmtFin Type 1, dtype: int64

In [16]:
# Encode 'BsmtFin Type 1' on an ordinal scale (GLQ=6 ... Unf=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'BsmtFin Type 1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3,
                             'LwQ': 2, 'Unf': 1, 'NA': 0}}
df_train_pp['BsmtFin Type 1'] = (
    df_train_pp['BsmtFin Type 1'].replace(labels['BsmtFin Type 1']).astype('int64')
)
df_train_pp['BsmtFin Type 1'].dtype

dtype('int64')

In [17]:
# Check which 'Heating QC' levels occur, and how often, before encoding.
df_train_pp['Heating QC'].value_counts()

Ex    1065
TA     597
Gd     319
Fa      67
Po       3
Name: Heating QC, dtype: int64

In [18]:
# Encode 'Heating QC' on an ordinal scale (Ex=5 ... Po=1).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Heating QC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
df_train_pp['Heating QC'] = (
    df_train_pp['Heating QC'].replace(labels['Heating QC']).astype('int64')
)
df_train_pp['Heating QC'].dtype

dtype('int64')

In [19]:
# Check the 'Central Air' levels (Y / N) before encoding.
df_train_pp['Central Air'].value_counts()

Y    1910
N     141
Name: Central Air, dtype: int64

In [20]:
# Encode 'Central Air' as a 0/1 flag (Y -> 1, N -> 0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Central Air': {'Y': 1, 'N': 0}}
df_train_pp['Central Air'] = (
    df_train_pp['Central Air'].replace(labels['Central Air']).astype('int64')
)
df_train_pp['Central Air'].dtype

dtype('int64')

In [21]:
# Check which 'Kitchen Qual' levels occur, and how often, before encoding.
df_train_pp['Kitchen Qual'].value_counts()

TA    1047
Gd     806
Ex     151
Fa      47
Name: Kitchen Qual, dtype: int64

In [22]:
# Encode 'Kitchen Qual' on an ordinal scale (Ex=5 ... Po=1).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Kitchen Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
df_train_pp['Kitchen Qual'] = (
    df_train_pp['Kitchen Qual'].replace(labels['Kitchen Qual']).astype('int64')
)
df_train_pp['Kitchen Qual'].dtype

dtype('int64')

In [23]:
# Check which 'Functional' levels occur, and how often, before encoding.
df_train_pp['Functional'].value_counts()

Typ     1915
Min1      42
Min2      42
Mod       29
Maj1      12
Maj2       7
Sal        2
Sev        2
Name: Functional, dtype: int64

In [24]:
# Encode 'Functional' on an ordinal scale (Typ=7 down to Sal=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Functional': {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3,
                         'Maj2': 2, 'Sev': 1, 'Sal': 0}}
df_train_pp['Functional'] = (
    df_train_pp['Functional'].replace(labels['Functional']).astype('int64')
)
df_train_pp['Functional'].dtype

dtype('int64')

In [25]:
# Check which 'Fireplace Qu' levels occur (including the literal 'NA') before encoding.
df_train_pp['Fireplace Qu'].value_counts()

NA    1000
Gd     523
TA     407
Fa      59
Ex      31
Po      31
Name: Fireplace Qu, dtype: int64

In [26]:
# Encode 'Fireplace Qu' on an ordinal scale (Ex=5 ... Po=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Fireplace Qu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
df_train_pp['Fireplace Qu'] = (
    df_train_pp['Fireplace Qu'].replace(labels['Fireplace Qu']).astype('int64')
)
df_train_pp['Fireplace Qu'].dtype

dtype('int64')

In [27]:
# Check which 'Garage Finish' levels occur (including the literal 'NA') before encoding.
df_train_pp['Garage Finish'].value_counts()

Unf    849
RFn    579
Fin    509
NA     114
Name: Garage Finish, dtype: int64

In [28]:
# Encode 'Garage Finish' on an ordinal scale (Fin=3, RFn=2, Unf=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Garage Finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0}}
df_train_pp['Garage Finish'] = (
    df_train_pp['Garage Finish'].replace(labels['Garage Finish']).astype('int64')
)
df_train_pp['Garage Finish'].dtype

dtype('int64')

In [29]:
# Check which 'Garage Qual' levels occur (including the literal 'NA') before encoding.
df_train_pp['Garage Qual'].value_counts()

TA    1832
NA     114
Fa      82
Gd      18
Ex       3
Po       2
Name: Garage Qual, dtype: int64

In [30]:
# Encode 'Garage Qual' on an ordinal scale (Ex=5 ... Po=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Garage Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
df_train_pp['Garage Qual'] = (
    df_train_pp['Garage Qual'].replace(labels['Garage Qual']).astype('int64')
)
df_train_pp['Garage Qual'].dtype

dtype('int64')

In [31]:
# Check which 'Garage Cond' levels occur (including the literal 'NA') before encoding.
df_train_pp['Garage Cond'].value_counts()

TA    1868
NA     114
Fa      47
Gd      12
Po       8
Ex       2
Name: Garage Cond, dtype: int64

In [32]:
# Encode 'Garage Cond' on an ordinal scale (Ex=5 ... Po=1, 'NA'=0).
# The cast is assigned back so the stored column is really int64; replace()
# alone is not guaranteed to downcast an object column in recent pandas.
# astype('int64') raises if a level is missing from the mapping.
labels = {'Garage Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
df_train_pp['Garage Cond'] = (
    df_train_pp['Garage Cond'].replace(labels['Garage Cond']).astype('int64')
)
df_train_pp['Garage Cond'].dtype

dtype('int64')

In [33]:
# Re-check column dtypes now that the ordinal columns have been encoded.
df_train_pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 49 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MS Zoning       2051 non-null   object 
 1   Street          2051 non-null   object 
 2   Alley           2051 non-null   object 
 3   Neighborhood    2051 non-null   object 
 4   Condition 1     2051 non-null   object 
 5   Bldg Type       2051 non-null   object 
 6   House Style     2051 non-null   object 
 7   Overall Qual    2051 non-null   int64  
 8   Year Built      2051 non-null   int64  
 9   Year Remod/Add  2051 non-null   int64  
 10  Roof Style      2051 non-null   object 
 11  Exterior 1st    2051 non-null   object 
 12  Exterior 2nd    2051 non-null   object 
 13  Mas Vnr Area    2051 non-null   float64
 14  Exter Qual      2051 non-null   int64  
 15  Exter Cond      2051 non-null   int64  
 16  Foundation      2051 non-null   object 
 17  Bsmt Qual       2051 non-null   i

## 6.1 comments:
Converted 14 ordinal categorical columns to integer (`int64`) codes.

In [54]:
# Labels of every column whose name contains 'Bsmt'.
bsmt_cat = df_train_pp.columns[df_train_pp.columns.str.contains('Bsmt')]

In [57]:
# Set up the estimator, the target and the basement feature matrix, then build
# the polynomial expansion of those features (no bias column).
lr = LinearRegression()
y = df_train_pp['SalePrice']
X = df_train_pp[bsmt_cat]
poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), and fall back only on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(bsmt_cat)
else:
    poly_feature_names = poly.get_feature_names(bsmt_cat)

pd.DataFrame(X_poly, columns=poly_feature_names)

Unnamed: 0,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,Bsmt Unf SF,Total Bsmt SF,Bsmt Qual^2,Bsmt Qual Bsmt Cond,Bsmt Qual Bsmt Exposure,...,BsmtFin Type 1^2,BsmtFin Type 1 BsmtFin SF 1,BsmtFin Type 1 Bsmt Unf SF,BsmtFin Type 1 Total Bsmt SF,BsmtFin SF 1^2,BsmtFin SF 1 Bsmt Unf SF,BsmtFin SF 1 Total Bsmt SF,Bsmt Unf SF^2,Bsmt Unf SF Total Bsmt SF,Total Bsmt SF^2
0,3.0,3.0,1.0,6.0,533.0,192.0,725.0,9.0,9.0,3.0,...,36.0,3198.0,1152.0,4350.0,284089.0,102336.0,386425.0,36864.0,139200.0,525625.0
1,4.0,3.0,1.0,6.0,637.0,276.0,913.0,16.0,12.0,4.0,...,36.0,3822.0,1656.0,5478.0,405769.0,175812.0,581581.0,76176.0,251988.0,833569.0
2,3.0,3.0,1.0,6.0,731.0,326.0,1057.0,9.0,9.0,3.0,...,36.0,4386.0,1956.0,6342.0,534361.0,238306.0,772667.0,106276.0,344582.0,1117249.0
3,4.0,3.0,1.0,1.0,0.0,384.0,384.0,16.0,12.0,4.0,...,1.0,0.0,384.0,384.0,0.0,0.0,0.0,147456.0,147456.0,147456.0
4,2.0,4.0,1.0,1.0,0.0,676.0,676.0,4.0,8.0,2.0,...,1.0,0.0,676.0,676.0,0.0,0.0,0.0,456976.0,456976.0,456976.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,4.0,3.0,3.0,6.0,1011.0,873.0,1884.0,16.0,12.0,12.0,...,36.0,6066.0,5238.0,11304.0,1022121.0,882603.0,1904724.0,762129.0,1644732.0,3549456.0
2047,3.0,3.0,1.0,4.0,262.0,599.0,861.0,9.0,9.0,3.0,...,16.0,1048.0,2396.0,3444.0,68644.0,156938.0,225582.0,358801.0,515739.0,741321.0
2048,3.0,3.0,1.0,1.0,0.0,896.0,896.0,9.0,9.0,3.0,...,1.0,0.0,896.0,896.0,0.0,0.0,0.0,802816.0,802816.0,802816.0
2049,3.0,3.0,1.0,3.0,155.0,295.0,1200.0,9.0,9.0,3.0,...,9.0,465.0,885.0,3600.0,24025.0,45725.0,186000.0,87025.0,354000.0,1440000.0


In [60]:
# Mean 5-fold cross-validation score (the estimator's default scorer) for the
# polynomial basement features versus the plain basement features.
poly_cv_mean = cross_val_score(lr, X_poly, y, cv=5).mean()
plain_cv_mean = cross_val_score(lr, X, y, cv=5).mean()
poly_cv_mean, plain_cv_mean

(0.6595435824205642, 0.5449584427214207)

# 6.2 Scaling of Data

In [34]:
# Standardise every int64/float64 predictor; the target 'SalePrice' is excluded.
numeric_cols = df_train_pp.select_dtypes(include=['int64', 'float64']).columns
num_data = [col for col in numeric_cols if col != 'SalePrice']

nums = df_train_pp[num_data]
ss = StandardScaler()
nums_scaled = ss.fit_transform(nums)  # fit and transform on the same frame

nums_scaled.shape

(2051, 32)

In [35]:
# Wrap the scaled array in a labelled frame for the later column-wise concat.
# Reusing the source index keeps rows aligned with df_train_pp even if its
# index is not the default 0..n-1 range. A fresh RangeIndex would otherwise
# make concat(axis=1) pair the wrong rows or introduce NaNs.
nums_scaled_pd = pd.DataFrame(nums_scaled, columns=num_data, index=nums.index)
nums_scaled_pd.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,...,Functional,Fireplaces,Fireplace Qu,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Wood Deck SF,Open Porch SF
0,-0.078644,0.142227,0.989479,1.092329,1.010273,-0.230243,-0.543594,0.123625,-0.582816,1.14603,...,0.229782,-0.925701,-0.976141,0.308137,0.293112,0.007216,0.273171,0.264923,-0.730121,-0.053301
1,0.622656,0.805126,0.60909,0.191491,1.010273,-0.230243,0.565771,0.123625,-0.582816,1.14603,...,0.229782,0.640811,0.684405,0.308137,0.293112,0.395957,0.273171,0.264923,-0.730121,0.396266
2,-0.779944,-0.620106,1.084576,-0.565901,-0.690934,2.452869,-0.543594,0.123625,-0.582816,1.14603,...,0.229782,-0.925701,-0.976141,-0.804519,-1.013786,-1.052565,0.273171,0.264923,-0.730121,0.066584
3,-0.779944,1.136575,1.084576,-0.565901,-0.690934,-0.230243,0.565771,0.123625,-0.582816,-1.20683,...,0.229782,-0.925701,-0.976141,1.420793,0.293112,-0.339874,0.273171,0.264923,0.04798,-0.712665
4,-0.078644,-2.376787,0.418896,-0.565901,-0.690934,-0.230243,-1.652959,1.872274,-0.582816,-1.20683,...,0.229782,-0.925701,-0.976141,-0.804519,0.293112,0.048867,0.273171,0.264923,-0.730121,0.171483


# 6.3 One-hot encode categorical variables

- Create dummy (one-hot) columns for the remaining categorical columns of the dataframe.

In [36]:
#selecting object dtypes to create dummies
obj_data = df_train_pp.select_dtypes(['object']).keys()
print(obj_data)
len(obj_data)

Index(['MS Zoning', 'Street', 'Alley', 'Neighborhood', 'Condition 1',
       'Bldg Type', 'House Style', 'Roof Style', 'Exterior 1st',
       'Exterior 2nd', 'Foundation', 'Heating', 'Electrical', 'Garage Type',
       'Paved Drive', 'Sale Type'],
      dtype='object')


16

In [37]:
# One-hot encode the selected columns. Every one of them is object dtype, so
# get_dummies' default column selection encodes all of them.
categorical_frame = df_train_pp[obj_data]
obj_processed = pd.get_dummies(categorical_frame)
obj_processed

Unnamed: 0,MS Zoning_A (agr),MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,...,Paved Drive_Y,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2047,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2048,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2049,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1


In [38]:
#remove columns with NA count values
na_col = obj_processed.filter(regex = 'NA')
na_col_keys = na_col.keys()
na_col_keys

Index(['Alley_NA', 'Neighborhood_NAmes', 'Garage Type_NA'], dtype='object')

In [39]:
# Drop the selected 'NA' dummy columns. Reassigning (instead of inplace=True)
# with errors='ignore' keeps the cell safe to re-run: a second execution no
# longer raises KeyError for columns that are already gone.
obj_processed = obj_processed.drop(columns=na_col_keys, errors='ignore')
obj_processed.shape

(2051, 130)

In [40]:
# Assemble the modelling table column-wise: scaled numerics, one-hot dummies,
# then the unscaled target.
frames_to_join = [nums_scaled_pd, obj_processed, df_train_pp['SalePrice']]
df_processed = pd.concat(
    frames_to_join,
    axis=1,
    ignore_index=False,
    sort=False,
)
df_processed.shape

(2051, 163)

In [41]:
# Preview the combined frame (scaled numerics, dummies, SalePrice).
df_processed.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD,SalePrice
0,-0.078644,0.142227,0.989479,1.092329,1.010273,-0.230243,-0.543594,0.123625,-0.582816,1.14603,...,0,0,0,0,0,0,0,0,1,130500
1,0.622656,0.805126,0.60909,0.191491,1.010273,-0.230243,0.565771,0.123625,-0.582816,1.14603,...,0,0,0,0,0,0,0,0,1,220000
2,-0.779944,-0.620106,1.084576,-0.565901,-0.690934,2.452869,-0.543594,0.123625,-0.582816,1.14603,...,0,0,0,0,0,0,0,0,1,109000
3,-0.779944,1.136575,1.084576,-0.565901,-0.690934,-0.230243,0.565771,0.123625,-0.582816,-1.20683,...,0,0,0,0,0,0,0,0,1,174000
4,-0.078644,-2.376787,0.418896,-0.565901,-0.690934,-0.230243,-1.652959,1.872274,-0.582816,-1.20683,...,0,0,0,0,0,0,0,0,1,138500


# 6.4 Exporting Data for Model

In [42]:
# Write the processed table to disk for the modelling step; the row index is
# not written.
df_processed.to_csv(
    "./datasets/train_processed.csv",
    index=False,
)