![banner.png](attachment:banner.png)

# Ames Housing Saleprice

## Problem Statement

Create a regression model where we are able to see what features affects the price of the house at sales the most.

## Executive Summary

### Contents:
- [6. Pre-Processing](#6.-Pre-Processing)


Links:
[Kaggle challenge link](https://www.kaggle.com/c/dsi-us-6-project-2-regression-challenge/data)

## 6. Pre Processing

In [1]:
#Imports:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
plt.style.use('ggplot')

In [2]:
# Importing cleaned dataset for Pre Processing
df = pd.read_csv("../datasets/train_EDA.csv", na_filter=False)
df_test = pd.read_csv('../datasets/test.csv')

df.shape, df_test.shape

((2000, 38), (879, 80))

In [3]:
df.head()

Unnamed: 0,Id,Land Contour,Lot Config,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Year Built,...,Garage Area,Sale Type,SalePrice,Garage Total Value,Overall Total Value,Exter Total Value,Total Flr SF,Fireplace Total Value,Total Bath,Kitchen Total Value
0,109,Lvl,CulDSac,Sawyer,RRAe,Norm,1Fam,2Story,6,1976,...,475.0,WD,130500,10.0,14,7,1450,0,3.0,5
1,544,Lvl,CulDSac,SawyerW,Norm,Norm,1Fam,2Story,7,1996,...,559.0,WD,220000,10.0,12,7,1826,4,4.0,5
2,153,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,5,1953,...,246.0,WD,109000,8.0,12,7,2114,0,2.0,5
3,318,Lvl,Inside,Timber,Norm,Norm,1Fam,2Story,5,2006,...,400.0,WD,174000,11.0,10,6,1488,0,3.0,4
4,255,Lvl,Inside,SawyerW,Norm,Norm,1Fam,1.5Fin,6,1900,...,484.0,WD,138500,9.0,14,6,1662,0,2.0,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     2000 non-null   int64  
 1   Land Contour           2000 non-null   object 
 2   Lot Config             2000 non-null   object 
 3   Neighborhood           2000 non-null   object 
 4   Condition 1            2000 non-null   object 
 5   Condition 2            2000 non-null   object 
 6   Bldg Type              2000 non-null   object 
 7   House Style            2000 non-null   object 
 8   Overall Qual           2000 non-null   int64  
 9   Year Built             2000 non-null   int64  
 10  Year Remod/Add         2000 non-null   int64  
 11  Roof Style             2000 non-null   object 
 12  Roof Matl              2000 non-null   object 
 13  Exterior 1st           2000 non-null   object 
 14  Mas Vnr Area           2000 non-null   float64
 15  Exte

# 6.1 Scaling of Data

In [5]:
num_data = df.select_dtypes(['int64', 'float64']).keys()
num_data = [x for x in num_data if ((x != 'SalePrice') & (x != 'Id'))]

nums = df[num_data]
ss = StandardScaler()
ss.fit(nums)
nums_scaled = ss.transform(nums)

nums_scaled.shape

(2000, 23)

In [6]:
nums_scaled_pd = pd.DataFrame(nums_scaled, columns = num_data) #create pd for combining later
nums_scaled_pd.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Bsmt Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,...,Garage Finish,Garage Cars,Garage Area,Garage Total Value,Overall Total Value,Exter Total Value,Total Flr SF,Fireplace Total Value,Total Bath,Kitchen Total Value
0,-0.061266,0.149244,0.998447,1.376551,1.067726,-0.549819,-0.782514,-1.181888,0.005746,0.814567,...,0.318367,0.321786,0.03954,0.354378,1.428204,0.76795,-1.181888,-0.97529,0.642212,0.699168
1,0.672457,0.814618,0.61785,0.300454,1.067726,0.588523,-0.317516,-0.655356,1.41502,0.814567,...,0.318367,0.321786,0.44481,0.354378,0.208038,0.76795,-0.655356,0.718289,1.738137,0.699168
2,-0.794989,-0.615937,1.093596,-0.60429,-0.692745,-0.549819,0.038653,-0.252055,-0.91916,-1.055851,...,-0.800674,-1.013425,-1.065302,-0.427913,0.208038,0.76795,-0.252055,-0.97529,-0.453713,0.699168
3,-0.794989,1.147305,1.093596,-0.60429,-0.692745,0.588523,-1.625941,-1.128675,-0.070965,0.814567,...,1.437409,0.321786,-0.322308,0.745524,-1.012128,-0.710299,-1.128675,-0.97529,0.642212,-0.844249
4,-0.061266,-2.37918,0.427552,-0.60429,-0.692745,-1.688161,-0.90371,-0.885014,-0.068773,0.814567,...,-0.800674,0.321786,0.082962,-0.036768,1.428204,-0.710299,-0.885014,-0.97529,-0.453713,-0.844249


# 6.2 One-hot encode categorical variables

- Creating dummies for dataframe

In [7]:
#selecting object dtypes to create dummies
obj_data = df.select_dtypes(['object']).keys()
print(obj_data)
len(obj_data)

Index(['Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Heating', 'Garage Type', 'Sale Type'],
      dtype='object')


13

In [8]:
obj_processed = pd.get_dummies(df[obj_data], columns = obj_data)
obj_processed.head()

Unnamed: 0,Land Contour_Bnk,Land Contour_HLS,Land Contour_Low,Land Contour_Lvl,Lot Config_Corner,Lot Config_CulDSac,Lot Config_FR2,Lot Config_FR3,Lot Config_Inside,Neighborhood_Blmngtn,...,Garage Type_NA,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
#remove columns with NA count values
na_col = obj_processed.filter(regex = 'NA')
na_col_keys = na_col.keys()
na_col_keys

Index(['Neighborhood_NAmes', 'Garage Type_NA'], dtype='object')

In [10]:
obj_processed.drop(columns = na_col_keys, inplace = True) # dropping NA columns

In [11]:
df_new = pd.concat([df['Id'], nums_scaled_pd, obj_processed, df['SalePrice']], ignore_index=False, sort=False, axis = 1)
df_new.shape

(2000, 137)

In [12]:
df_new.columns

Index(['Id', 'Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'Exter Qual', 'Bsmt Qual', 'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area',
       ...
       'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD',
       'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth',
       'Sale Type_WD ', 'SalePrice'],
      dtype='object', length=137)

# Preprocess test data

In [105]:
df_test.shape

(879, 80)

In [13]:
#Copied from Data cleaning to apply on Test data
labels = {'Exter Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1} }
df_test.replace(labels, inplace=True)


labels = {'Exter Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1} }
df_test.replace(labels, inplace=True)

labels = {'Bsmt Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Bsmt Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)

labels = {'Bsmt Exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'BsmtFin Type 1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)

labels = {'Heating QC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1} }
df_test.replace(labels, inplace=True)


labels = {'Central Air': {'Y': 1, 'N': 0} }
df_test.replace(labels, inplace=True)


labels = {'Kitchen Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1} }
df_test.replace(labels, inplace=True)

labels = {'Functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 
                         'Maj2': 3, 'Sev': 2, 'Sal': 1} }
df_test.replace(labels, inplace=True)

labels = {'Fireplace Qu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Garage Finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Garage Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Garage Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Lot Shape': {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1} }
df_test.replace(labels, inplace=True)


labels = {'Utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1} }
df_test.replace(labels, inplace=True)



labels = {'Land Slope': {'Gtl': 3, 'Mod': 2, 'Sev': 1} }
df_test.replace(labels, inplace=True)


labels = {'Foundation': {'BrkTil': 6, 'CBlock': 5, 'PConc': 4, 'Slab': 3, 'Stone': 2, 'Wood': 1} }
df_test.replace(labels, inplace=True)


labels = {'BsmtFin Type 2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Electrical': {'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Paved Drive': {'Y': 2, 'P': 1, 'N': 0} }
df_test.replace(labels, inplace=True)


labels = {'Pool QC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)


labels = {'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0} }
df_test.replace(labels, inplace=True)

In [18]:
#Copied from Data cleaning to apply on Test data

df_test['Bsmt Total Value'] = (df_test['Bsmt Qual'] + df_test['Bsmt Cond'] + df_test['Bsmt Exposure']+ 
                          df_test['BsmtFin Type 1']+ df_test['BsmtFin Type 2']+ df_test['Bsmt Full Bath'])


df_test['Garage Total Value'] = (df_test['Garage Finish'] + df_test['Garage Cars'] + df_test['Garage Qual']+ 
                          df_test['Garage Cond'])


df_test['Lot Total Value'] = ((df_test['Lot Frontage'] + df_test['Lot Area']))


df_test['Overall Total Value'] = (df_test['Overall Qual'] + df_test['Overall Cond'])


df_test['Exter Total Value'] = (df_test['Exter Qual'] + df_test['Exter Cond'])


df_test['Total Flr SF'] = (df_test['1st Flr SF'] + df_test['1st Flr SF'])


df_test['Fireplace Total Value'] = (df_test['Fireplaces'] + df_test['Fireplace Qu'])


df_test['Total Porch Area'] = (df_test['Open Porch SF'] + df_test['Enclosed Porch'] +
                          df_test['3Ssn Porch'] + df_test['Screen Porch'])


df_test['Total Bath'] = (df_test['Bsmt Full Bath'] + df_test['Bsmt Half Bath'] +
                          df_test['Full Bath'] + df_test['Half Bath'])


df_test['Kitchen Total Value'] = (df_test['Kitchen AbvGr'] + df_test['Kitchen Qual'])

In [20]:
col_diff = df_test.columns.difference(df.columns)
df_test.drop(columns = col_diff, inplace = True)

Index(['2nd Flr SF', '3Ssn Porch', 'Alley', 'Bedroom AbvGr', 'Bsmt Cond',
       'Bsmt Exposure', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Bsmt Total Value',
       'Bsmt Unf SF', 'BsmtFin SF 1', 'BsmtFin SF 2', 'BsmtFin Type 1',
       'BsmtFin Type 2', 'Central Air', 'Electrical', 'Enclosed Porch',
       'Exter Cond', 'Exterior 2nd', 'Fence', 'Fireplaces', 'Foundation',
       'Functional', 'Garage Cond', 'Garage Qual', 'Garage Yr Blt',
       'Half Bath', 'Heating QC', 'Kitchen AbvGr', 'Land Slope', 'Lot Area',
       'Lot Frontage', 'Lot Shape', 'Lot Total Value', 'Low Qual Fin SF',
       'MS SubClass', 'MS Zoning', 'Mas Vnr Type', 'Misc Feature', 'Misc Val',
       'Mo Sold', 'Open Porch SF', 'Overall Cond', 'PID', 'Paved Drive',
       'Pool Area', 'Pool QC', 'Screen Porch', 'Street', 'Total Porch Area',
       'Utilities', 'Wood Deck SF', 'Yr Sold'],
      dtype='object')

# Scaling Test Data

In [26]:
test_num_data = df_test.select_dtypes(['int64', 'float64']).keys()
test_num_data = [x for x in num_data if ((x != 'SalePrice') & (x != 'Id'))]

test_nums = df_test[num_data]

test_nums_scaled = ss.transform(test_nums)

test_nums_scaled.shape

(879, 23)

In [27]:
test_nums_scaled_pd = pd.DataFrame(test_nums_scaled, columns = num_data) #create pd for combining later
test_nums_scaled_pd.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Bsmt Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,...,Garage Finish,Garage Cars,Garage Area,Garage Total Value,Overall Total Value,Exter Total Value,Total Flr SF,Fireplace Total Value,Total Bath,Kitchen Total Value
0,-0.061266,-2.046493,-1.618152,-0.60429,-0.692745,-1.688161,-0.052863,-0.66936,0.989827,0.814567,...,-0.800674,-1.013425,-0.129322,-1.992496,1.428204,-2.188548,-0.66936,,-0.453713,-0.844249
1,-0.794989,0.182512,-0.33364,-0.60429,-0.692745,0.588523,2.289441,2.296585,1.075304,0.814567,...,1.437409,0.321786,0.546128,0.745524,-1.622211,-0.710299,2.296585,,-0.453713,0.699168
2,0.672457,1.147305,1.046021,-0.60429,1.067726,0.588523,-0.958125,-1.352731,0.043005,0.814567,...,0.318367,0.321786,-0.196867,0.354378,0.208038,0.76795,-1.352731,1.141684,1.738137,0.699168
3,-0.794989,-1.613999,1.046021,-0.60429,1.067726,-0.549819,-0.181479,-0.501318,-1.114222,-1.055851,...,-0.800674,0.321786,0.063664,-0.427913,-0.402045,0.76795,-0.501318,,-1.549638,-0.844249
4,-0.061266,-0.28325,-0.999683,1.088678,-0.692745,0.588523,0.872187,0.691782,-0.18055,-1.055851,...,0.318367,0.321786,0.227701,0.354378,-0.402045,-0.710299,0.691782,1.565079,0.642212,-0.844249


# get_dummies test Data

In [28]:
test_obj_data = df_test.select_dtypes(['object']).keys()
test_obj_processed = pd.get_dummies(df_test[test_obj_data], columns = test_obj_data)
test_obj_processed.head()

Unnamed: 0,Land Contour_Bnk,Land Contour_HLS,Land Contour_Low,Land Contour_Lvl,Lot Config_Corner,Lot Config_CulDSac,Lot Config_FR2,Lot Config_FR3,Lot Config_Inside,Neighborhood_Blmngtn,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


# Creating processed test dataset

In [29]:
df_test_new = pd.concat([df_test['Id'], test_nums_scaled_pd, test_obj_processed], ignore_index=False, sort=False, axis = 1)
df_test_new.shape

(879, 130)

In [30]:
df_new.shape

(2000, 137)

# Exporting both datasets for model

In [31]:
df_new.to_csv("../datasets/train_preprocess.csv", index=False)
df_test_new.to_csv("../datasets/test_preprocess.csv", index=False)

## continue to Model Benchmarking