# Ames Housing Saleprice

## Problem Statement

Create a regression model where we are able to see what features affects the price of the house at sales the most.

## Executive Summary

### Contents:
- [6. Pre-Processing](#6.-Pre-Processing)


Links:
[Kaggle challenge link](https://www.kaggle.com/c/dsi-us-6-project-2-regression-challenge/data)

## 6. Pre Processing

In [1]:
#Imports:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
plt.style.use('ggplot')

In [2]:
# Importing cleaned dataset for Pre Processing
df = pd.read_csv("../datasets/AHD_EDA.csv", na_filter=False)
df_train = pd.read_csv('../datasets/train.csv')
df_test = pd.read_csv('../datasets/test.csv')

df.shape, df_train.shape, df_test.shape

((2879, 38), (2051, 81), (879, 80))

In [3]:
df_train_rows = df_train['Id'].tolist()
df_test_rows = df_test['Id'].tolist()

df_train = df.loc[df['Id'].isin(df_train_rows)]
df_test = df.loc[df['Id'].isin(df_test_rows)]

df_train.shape ,df_test.shape

((2000, 38), (879, 38))

In [4]:
df.head()

Unnamed: 0,Id,Land Contour,Lot Config,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Year Built,...,Garage Area,Sale Type,SalePrice,Garage Total Value,Overall Total Value,Exter Total Value,Total Flr SF,Fireplace Total Value,Total Bath,Kitchen Total Value
0,109,Lvl,CulDSac,Sawyer,RRAe,Norm,1Fam,2Story,6,1976,...,475.0,WD,130500,10.0,14,7,1450,0,3.0,5
1,544,Lvl,CulDSac,SawyerW,Norm,Norm,1Fam,2Story,7,1996,...,559.0,WD,220000,10.0,12,7,1826,4,4.0,5
2,153,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,5,1953,...,246.0,WD,109000,8.0,12,7,2114,0,2.0,5
3,318,Lvl,Inside,Timber,Norm,Norm,1Fam,2Story,5,2006,...,400.0,WD,174000,11.0,10,6,1488,0,3.0,4
4,255,Lvl,Inside,SawyerW,Norm,Norm,1Fam,1.5Fin,6,1900,...,484.0,WD,138500,9.0,14,6,1662,0,2.0,4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     2879 non-null   int64  
 1   Land Contour           2879 non-null   object 
 2   Lot Config             2879 non-null   object 
 3   Neighborhood           2879 non-null   object 
 4   Condition 1            2879 non-null   object 
 5   Condition 2            2879 non-null   object 
 6   Bldg Type              2879 non-null   object 
 7   House Style            2879 non-null   object 
 8   Overall Qual           2879 non-null   int64  
 9   Year Built             2879 non-null   int64  
 10  Year Remod/Add         2879 non-null   int64  
 11  Roof Style             2879 non-null   object 
 12  Roof Matl              2879 non-null   object 
 13  Exterior 1st           2879 non-null   object 
 14  Mas Vnr Area           2879 non-null   float64
 15  Exte

# 6.1 Scaling of Data

In [6]:
num_data = df.select_dtypes(['int64', 'float64']).keys()
num_data = [x for x in num_data if ((x != 'SalePrice') & (x != 'Id'))]

nums = df[num_data]
ss = StandardScaler()
ss.fit(nums)
nums_scaled = ss.transform(nums)

nums_scaled.shape

(2879, 23)

In [7]:
nums_scaled_pd = pd.DataFrame(nums_scaled, columns = num_data) #create pd for combining later
nums_scaled_pd.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Bsmt Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,...,Garage Finish,Garage Cars,Garage Area,Garage Total Value,Overall Total Value,Exter Total Value,Total Flr SF,Fireplace Total Value,Total Bath,Kitchen Total Value
0,-0.054655,0.158654,1.000429,1.217818,1.077984,-0.533349,-0.770345,-1.158724,-0.009989,0.815249,...,0.319071,0.327637,0.033324,0.359443,1.428063,0.774638,-1.158724,-0.983261,0.628727,0.696981
1,0.677215,0.821737,0.616667,0.237957,1.077984,0.592393,-0.310527,-0.642906,1.34142,0.815249,...,0.319071,0.327637,0.435184,0.359443,0.214188,0.774638,-0.642906,0.701493,1.714573,0.696981
2,-0.786525,-0.603891,1.09637,-0.585875,-0.688388,-0.533349,0.041674,-0.247811,-0.896917,-1.02706,...,-0.798453,-1.006544,-1.062223,-0.425714,0.214188,0.774638,-0.247811,-0.983261,-0.457119,0.696981
3,-0.786525,1.153278,1.09637,-0.585875,-0.688388,0.592393,-1.604375,-1.106593,-0.083549,0.815249,...,1.436594,0.327637,-0.32548,0.752022,-0.999686,-0.703283,-1.106593,-0.983261,0.628727,-0.839472
4,-0.054655,-2.36106,0.424785,-0.585875,-0.688388,-1.65909,-0.890191,-0.86789,-0.081448,0.815249,...,-0.798453,0.327637,0.07638,-0.033135,1.428063,-0.703283,-0.86789,-0.983261,-0.457119,-0.839472


# 6.2 One-hot encode categorical variables

- Creating dummies for dataframe

In [8]:
#selecting object dtypes to create dummies
obj_data = df.select_dtypes(['object']).keys()
print(obj_data)
len(obj_data)

Index(['Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Heating', 'Garage Type', 'Sale Type'],
      dtype='object')


13

In [9]:
obj_processed = pd.get_dummies(df[obj_data], columns = obj_data)
obj_processed

Unnamed: 0,Land Contour_Bnk,Land Contour_HLS,Land Contour_Low,Land Contour_Lvl,Lot Config_Corner,Lot Config_CulDSac,Lot Config_FR2,Lot Config_FR3,Lot Config_Inside,Neighborhood_Blmngtn,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2874,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2875,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2876,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2877,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
#remove columns with NA count values
na_col = obj_processed.filter(regex = 'NA')
na_col_keys = na_col.keys()
na_col_keys

Index(['Neighborhood_NAmes', 'Garage Type_NA'], dtype='object')

In [11]:
df_new = pd.concat([df['Id'], nums_scaled_pd, obj_processed, df['SalePrice']], ignore_index=False, sort=False, axis = 1)
df_new.shape

(2879, 144)

In [12]:
df_new.columns

Index(['Id', 'Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'Exter Qual', 'Bsmt Qual', 'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area',
       ...
       'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD', 'Sale Type_ConLI',
       'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth', 'Sale Type_VWD',
       'Sale Type_WD ', 'SalePrice'],
      dtype='object', length=144)

In [13]:
df_new.to_csv("../datasets/AHD_PP_FE.csv", index=False)

## continue to Model Benchmarking