# Ames Housing Sale Price

## Problem Statement

Create a regression model that predicts the sale price of a house.

## Executive Summary

### Contents:
- [6. Pre-Processing](#6.-Pre-Processing)


Links:
[Kaggle challenge link](https://www.kaggle.com/c/dsi-us-6-project-2-regression-challenge/data)

## 6. Pre-Processing

In [1]:
#Imports:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
# Apply the 'ggplot' stylesheet to any matplotlib figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Load the cleaned dataset for pre-processing.
# NOTE(review): the path points at the *test* split even though the variable is named
# df_train_pp -- confirm this naming is intentional.
# na_filter=False turns off pandas' missing-value detection, so the literal string 'NA'
# stays a category level instead of becoming NaN.
df_train_pp = pd.read_csv("../datasets/test_EDA_sorted.csv", na_filter=False)

In [3]:
# Preview the first rows of the loaded frame.
df_train_pp.head()

Unnamed: 0,MS Zoning,Street,Alley,Neighborhood,Condition 1,Bldg Type,House Style,Overall Qual,Year Built,Year Remod/Add,...,Garage Type,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Sale Type
0,RM,Pave,Grvl,OldTown,Norm,2fmCon,2Story,6,1910,1950,...,Detchd,Unf,1,440,Po,Po,Y,0,60,WD
1,RL,Pave,,Sawyer,Norm,Duplex,1Story,5,1977,1977,...,Attchd,Fin,2,580,TA,TA,Y,170,0,WD
2,RL,Pave,,Gilbert,Norm,1Fam,2Story,7,2006,2006,...,Attchd,RFn,2,426,TA,TA,Y,100,24,New
3,RM,Pave,,OldTown,Norm,1Fam,1Story,5,1923,2006,...,Detchd,Unf,2,480,Fa,TA,N,0,0,WD
4,RL,Pave,,NAmes,Norm,1Fam,1Story,6,1963,1963,...,Attchd,RFn,2,514,TA,TA,Y,0,76,WD


In [4]:
# Review the dtype and non-null count of every column before encoding.
df_train_pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 48 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MS Zoning       879 non-null    object 
 1   Street          879 non-null    object 
 2   Alley           879 non-null    object 
 3   Neighborhood    879 non-null    object 
 4   Condition 1     879 non-null    object 
 5   Bldg Type       879 non-null    object 
 6   House Style     879 non-null    object 
 7   Overall Qual    879 non-null    int64  
 8   Year Built      879 non-null    int64  
 9   Year Remod/Add  879 non-null    int64  
 10  Roof Style      879 non-null    object 
 11  Exterior 1st    879 non-null    object 
 12  Exterior 2nd    879 non-null    object 
 13  Mas Vnr Area    879 non-null    float64
 14  Exter Qual      879 non-null    object 
 15  Exter Cond      879 non-null    object 
 16  Foundation      879 non-null    object 
 17  Bsmt Qual       879 non-null    obj

# 6.1 Changing Categories to Ordinal data
- Some categorical columns are really ordinal: their levels have a natural order.
- Converting them to integers preserves that order so the models can make use of it.

In [5]:
# Check which grade codes occur in 'Exter Qual' before mapping them to integers.
df_train_pp['Exter Qual'].value_counts()

TA    552
Gd    292
Ex     26
Fa      9
Name: Exter Qual, dtype: int64

In [6]:
# Encode the 'Exter Qual' grade codes as integers (Ex highest, Po lowest).
labels = {'Exter Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Exter Qual'] = (
    df_train_pp['Exter Qual'].replace(labels['Exter Qual']).astype('int64')
)
df_train_pp['Exter Qual'].dtype

dtype('int64')

In [7]:
# Check which grade codes occur in 'Exter Cond' before mapping them to integers.
df_train_pp['Exter Cond'].value_counts()

TA    771
Gd     84
Fa     18
Ex      5
Po      1
Name: Exter Cond, dtype: int64

In [8]:
# Encode the 'Exter Cond' grade codes as integers (Ex highest, Po lowest).
labels = {'Exter Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Exter Cond'] = (
    df_train_pp['Exter Cond'].replace(labels['Exter Cond']).astype('int64')
)
df_train_pp['Exter Cond'].dtype

dtype('int64')

In [9]:
# Check which grade codes occur in 'Bsmt Qual' before mapping them to integers.
df_train_pp['Bsmt Qual'].value_counts()

TA    396
Gd    355
Ex     74
Fa     28
NA     25
Po      1
Name: Bsmt Qual, dtype: int64

In [10]:
# Encode the 'Bsmt Qual' grade codes as integers; the literal 'NA' level maps to 0.
labels = {'Bsmt Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Bsmt Qual'] = (
    df_train_pp['Bsmt Qual'].replace(labels['Bsmt Qual']).astype('int64')
)
df_train_pp['Bsmt Qual'].dtype

dtype('int64')

In [11]:
# Check which grade codes occur in 'Bsmt Cond' before mapping them to integers.
df_train_pp['Bsmt Cond'].value_counts()

TA    782
Fa     39
Gd     33
NA     25
Name: Bsmt Cond, dtype: int64

In [12]:
# Encode the 'Bsmt Cond' grade codes as integers; the literal 'NA' level maps to 0.
labels = {'Bsmt Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Bsmt Cond'] = (
    df_train_pp['Bsmt Cond'].replace(labels['Bsmt Cond']).astype('int64')
)
df_train_pp['Bsmt Cond'].dtype

dtype('int64')

In [13]:
# Check which exposure codes occur in 'Bsmt Exposure' before mapping them to integers.
df_train_pp['Bsmt Exposure'].value_counts()

No    567
Av    130
Gd     81
Mn     76
NA     25
Name: Bsmt Exposure, dtype: int64

In [14]:
# Encode the 'Bsmt Exposure' codes as integers (Gd highest); the literal 'NA' level maps to 0.
labels = {'Bsmt Exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Bsmt Exposure'] = (
    df_train_pp['Bsmt Exposure'].replace(labels['Bsmt Exposure']).astype('int64')
)
df_train_pp['Bsmt Exposure'].dtype

dtype('int64')

In [15]:
# Check which finish codes occur in 'BsmtFin Type 1' before mapping them to integers.
df_train_pp['BsmtFin Type 1'].value_counts()

Unf    248
GLQ    244
ALQ    136
Rec    105
BLQ     69
LwQ     52
NA      25
Name: BsmtFin Type 1, dtype: int64

In [16]:
# Encode the 'BsmtFin Type 1' codes as integers (GLQ highest); the literal 'NA' level maps to 0.
labels = {'BsmtFin Type 1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['BsmtFin Type 1'] = (
    df_train_pp['BsmtFin Type 1'].replace(labels['BsmtFin Type 1']).astype('int64')
)
df_train_pp['BsmtFin Type 1'].dtype

dtype('int64')

In [17]:
# Check which grade codes occur in 'Heating QC' before mapping them to integers.
df_train_pp['Heating QC'].value_counts() 

Ex    430
TA    267
Gd    157
Fa     25
Name: Heating QC, dtype: int64

In [18]:
# Encode the 'Heating QC' grade codes as integers (Ex highest, Po lowest).
labels = {'Heating QC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Heating QC'] = (
    df_train_pp['Heating QC'].replace(labels['Heating QC']).astype('int64')
)
df_train_pp['Heating QC'].dtype

dtype('int64')

In [19]:
# Check which flags occur in 'Central Air' before mapping them to integers.
df_train_pp['Central Air'].value_counts() 

Y    824
N     55
Name: Central Air, dtype: int64

In [20]:
# Encode the 'Central Air' Y/N flag as 1/0.
labels = {'Central Air': {'Y': 1, 'N': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Central Air'] = (
    df_train_pp['Central Air'].replace(labels['Central Air']).astype('int64')
)
df_train_pp['Central Air'].dtype

dtype('int64')

In [21]:
# Check which grade codes occur in 'Kitchen Qual' before mapping them to integers.
df_train_pp['Kitchen Qual'].value_counts() 

TA    447
Gd    354
Ex     54
Fa     23
Po      1
Name: Kitchen Qual, dtype: int64

In [22]:
# Encode the 'Kitchen Qual' grade codes as integers (Ex highest, Po lowest).
labels = {'Kitchen Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Kitchen Qual'] = (
    df_train_pp['Kitchen Qual'].replace(labels['Kitchen Qual']).astype('int64')
)
df_train_pp['Kitchen Qual'].dtype

dtype('int64')

In [23]:
# Check which codes occur in 'Functional' before mapping them to integers.
df_train_pp['Functional'].value_counts() 

Typ     813
Min2     28
Min1     23
Maj1      7
Mod       6
Maj2      2
Name: Functional, dtype: int64

In [24]:
# Encode the 'Functional' codes as integers (Typ highest, Sal lowest).
labels = {'Functional': {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3,
                         'Maj2': 2, 'Sev': 1, 'Sal': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Functional'] = (
    df_train_pp['Functional'].replace(labels['Functional']).astype('int64')
)
df_train_pp['Functional'].dtype

dtype('int64')

In [25]:
# Check which grade codes occur in 'Fireplace Qu' before mapping them to integers.
df_train_pp['Fireplace Qu'].value_counts() 

NA    422
Gd    221
TA    193
Fa     16
Po     15
Ex     12
Name: Fireplace Qu, dtype: int64

In [26]:
# Encode the 'Fireplace Qu' grade codes as integers; the literal 'NA' level maps to 0.
labels = {'Fireplace Qu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Fireplace Qu'] = (
    df_train_pp['Fireplace Qu'].replace(labels['Fireplace Qu']).astype('int64')
)
df_train_pp['Fireplace Qu'].dtype

dtype('int64')

In [27]:
# Check which finish codes occur in 'Garage Finish' before mapping them to integers.
df_train_pp['Garage Finish'].value_counts() 

Unf    382
RFn    233
Fin    219
NA      45
Name: Garage Finish, dtype: int64

In [28]:
# Encode the 'Garage Finish' codes as integers (Fin highest); the literal 'NA' level maps to 0.
labels = {'Garage Finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Garage Finish'] = (
    df_train_pp['Garage Finish'].replace(labels['Garage Finish']).astype('int64')
)
df_train_pp['Garage Finish'].dtype

dtype('int64')

In [29]:
# Check which grade codes occur in 'Garage Qual' before mapping them to integers.
df_train_pp['Garage Qual'].value_counts() 

TA    783
NA     45
Fa     42
Gd      6
Po      3
Name: Garage Qual, dtype: int64

In [30]:
# Encode the 'Garage Qual' grade codes as integers; the literal 'NA' level maps to 0.
labels = {'Garage Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Garage Qual'] = (
    df_train_pp['Garage Qual'].replace(labels['Garage Qual']).astype('int64')
)
df_train_pp['Garage Qual'].dtype

dtype('int64')

In [31]:
# Check which grade codes occur in 'Garage Cond' before mapping them to integers.
df_train_pp['Garage Cond'].value_counts() 

TA    797
NA     45
Fa     27
Po      6
Gd      3
Ex      1
Name: Garage Cond, dtype: int64

In [32]:
# Encode the 'Garage Cond' grade codes as integers; the literal 'NA' level maps to 0.
labels = {'Garage Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}}
# Cast explicitly and store the result instead of relying on replace() to downcast the
# column implicitly (deprecated in recent pandas); any unmapped code makes astype raise.
df_train_pp['Garage Cond'] = (
    df_train_pp['Garage Cond'].replace(labels['Garage Cond']).astype('int64')
)
df_train_pp['Garage Cond'].dtype

dtype('int64')

In [33]:
# Re-check the column dtypes after the encoding cells in section 6.1.
df_train_pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 48 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MS Zoning       879 non-null    object 
 1   Street          879 non-null    object 
 2   Alley           879 non-null    object 
 3   Neighborhood    879 non-null    object 
 4   Condition 1     879 non-null    object 
 5   Bldg Type       879 non-null    object 
 6   House Style     879 non-null    object 
 7   Overall Qual    879 non-null    int64  
 8   Year Built      879 non-null    int64  
 9   Year Remod/Add  879 non-null    int64  
 10  Roof Style      879 non-null    object 
 11  Exterior 1st    879 non-null    object 
 12  Exterior 2nd    879 non-null    object 
 13  Mas Vnr Area    879 non-null    float64
 14  Exter Qual      879 non-null    int64  
 15  Exter Cond      879 non-null    int64  
 16  Foundation      879 non-null    object 
 17  Bsmt Qual       879 non-null    int

## 6.1 comments:
Converted 14 ordinal categorical columns to integer data types.

# 6.2 Scaling of Data

In [34]:
# Standardise every int64/float64 column; 'SalePrice' is left out if it is present.
# NOTE(review): the scaler is fitted on this frame itself, which is read from
# test_EDA_sorted.csv -- confirm that is intended rather than reusing a scaler
# fitted on the training data.
num_data = [
    col
    for col in df_train_pp.select_dtypes(include=['int64', 'float64']).columns
    if col != 'SalePrice'
]

nums = df_train_pp[num_data]
ss = StandardScaler()
nums_scaled = ss.fit_transform(nums)  # fit + transform in one call; returns a NumPy array

nums_scaled.shape

(879, 32)

In [35]:
# Wrap the scaled array in a DataFrame so the numeric column names are kept for the
# later concat with the dummy columns.
nums_scaled_pd = pd.DataFrame(nums_scaled, columns = num_data) # no index passed, so pandas assigns a default 0..n-1 index
nums_scaled_pd.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,...,Functional,Fireplaces,Fireplace Qu,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Wood Deck SF,Open Porch SF
0,-0.039744,-1.992138,-1.684928,-0.567876,-0.678508,-2.937136,-1.596589,0.160561,-0.590655,-1.200628,...,0.250557,-0.925183,-0.992319,-0.793429,-0.991066,-0.144789,-2.57352,-2.583088,-0.772558,0.181019
1,-0.76756,0.212808,-0.364176,-0.567876,-0.678508,-0.228067,0.601378,0.160561,-0.590655,-1.200628,...,0.250557,-0.925183,-0.992319,1.434763,0.340963,0.512645,0.288117,0.274727,0.631179,-0.686412
2,0.688072,1.167188,1.054409,-0.567876,1.101815,-0.228067,0.601378,1.902942,1.283662,1.189208,...,0.250557,0.569736,1.22997,0.320667,0.340963,-0.210533,0.288117,0.274727,0.05317,-0.339439
3,-0.76756,-1.564313,1.054409,-0.567876,1.101815,-0.228067,-0.497606,0.160561,-0.590655,-1.200628,...,0.250557,-0.925183,-0.992319,-0.793429,0.340963,0.043049,-1.142701,0.274727,-0.772558,-0.686412
4,-0.039744,-0.247927,-1.04901,0.744718,-0.678508,-0.228067,0.601378,0.160561,-0.590655,0.233274,...,0.250557,2.064656,1.22997,0.320667,0.340963,0.202712,0.288117,0.274727,-0.772558,0.412334


# 6.3 One-hot encode categorical variables

- Creating dummies for dataframe

In [36]:
#selecting object dtypes to create dummies
obj_data = df_train_pp.select_dtypes(['object']).keys()
print(obj_data)
len(obj_data)

Index(['MS Zoning', 'Street', 'Alley', 'Neighborhood', 'Condition 1',
       'Bldg Type', 'House Style', 'Roof Style', 'Exterior 1st',
       'Exterior 2nd', 'Foundation', 'Heating', 'Electrical', 'Garage Type',
       'Paved Drive', 'Sale Type'],
      dtype='object')


16

In [37]:
# One-hot encode every object-dtype column (one indicator column per category level).
# dtype is pinned so the indicators are 0/1 integers on every pandas version; from
# pandas 2.0 the default became bool, which would be exported as True/False instead.
# NOTE(review): the dummy columns depend on the levels present in this file; confirm
# they are aligned with the training features before predicting.
obj_processed = pd.get_dummies(df_train_pp[obj_data], columns=obj_data, dtype='uint8')
obj_processed

Unnamed: 0,MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_NA,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
875,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
876,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
877,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1


In [38]:
#remove columns with NA count values
na_col = obj_processed.filter(regex = 'NA')
na_col_keys = na_col.keys()
na_col_keys

Index(['Alley_NA', 'Neighborhood_NAmes', 'Garage Type_NA'], dtype='object')

In [39]:
# Remove the selected 'NA' indicator columns from the dummy frame.
obj_processed = obj_processed.drop(columns=na_col_keys)
obj_processed.shape

(879, 126)

In [41]:
# Join the scaled numeric features and the dummy columns side by side.
# Rows are matched on the index; ignore_index=False and sort=False are the defaults.
df_processed = pd.concat([nums_scaled_pd, obj_processed], axis=1)
df_processed.shape

(879, 158)

In [42]:
# Preview the combined feature frame.
df_processed.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,-0.039744,-1.992138,-1.684928,-0.567876,-0.678508,-2.937136,-1.596589,0.160561,-0.590655,-1.200628,...,0,0,0,0,0,0,0,0,0,1
1,-0.76756,0.212808,-0.364176,-0.567876,-0.678508,-0.228067,0.601378,0.160561,-0.590655,-1.200628,...,0,0,0,0,0,0,0,0,0,1
2,0.688072,1.167188,1.054409,-0.567876,1.101815,-0.228067,0.601378,1.902942,1.283662,1.189208,...,0,0,0,0,0,0,1,0,0,0
3,-0.76756,-1.564313,1.054409,-0.567876,1.101815,-0.228067,-0.497606,0.160561,-0.590655,-1.200628,...,0,0,0,0,0,0,0,0,0,1
4,-0.039744,-0.247927,-1.04901,0.744718,-0.678508,-0.228067,0.601378,0.160561,-0.590655,0.233274,...,0,0,0,0,0,0,0,0,0,1


# 6.4 Exporting Data for Model

In [43]:
# Write the processed features to disk; index=False omits the row index from the file.
df_processed.to_csv("../datasets/test_processed.csv", index=False)