# House Price Data - Cleaning, Encoding

Here, handle missing (some Iterative Imputer) and then encoding ordinals and using OneHot for nominals.

**Q:** would it be possible to use encode/OneHot first, and then IterativeImputer for all missing values?
- difficulty encoding missing values 
- might need to impute and then "unimpute" for IterativeImputer

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [2]:
housePrice = pd.read_csv('Ames_HousePrice.csv', low_memory=False, index_col='PID')

In [3]:
# dictionary of housePrice column meanings for quick reference
with open('housePrice_features') as hp_feat:
    hp_dict = {}
    for line in hp_feat.readlines():
        feature, description = [*line.split(':')]
        hp_dict[feature] = description

In [4]:
hp_dict['LotFrontage']

' Linear feet of street connected to property\n'

In [5]:
housePrice = housePrice.drop(columns='Unnamed: 0')

#### Imputing Missing Values

In [6]:
# function for viewing columns with missing values
def nas(df):
    return df.loc[:, df.isna().sum() > 0 ].isna().sum().sort_values(ascending = False)

In [7]:
# investigate / impute column NaNs one by one
nas(housePrice)

PoolQC          2571
MiscFeature     2483
Alley           2412
Fence           2055
FireplaceQu     1241
LotFrontage      462
GarageCond       129
GarageQual       129
GarageFinish     129
GarageYrBlt      129
GarageType       127
BsmtExposure      71
BsmtFinType2      70
BsmtFinType1      69
BsmtQual          69
BsmtCond          69
MasVnrArea        14
MasVnrType        14
BsmtHalfBath       2
BsmtFullBath       2
TotalBsmtSF        1
BsmtUnfSF          1
GarageCars         1
GarageArea         1
BsmtFinSF2         1
BsmtFinSF1         1
Electrical         1
dtype: int64

In [8]:
hp_dict['PoolQC']             
housePrice.PoolQC.unique()
housePrice.PoolQC = housePrice.PoolQC.fillna('none')
housePrice.groupby('PoolQC')['SalePrice'].agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,mean,std,count
PoolQC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,465000.0,212132.034356,2
Fa,215500.0,48790.367902,2
Gd,215500.0,62932.503526,2
TA,170500.0,51529.117982,3
none,177786.980941,74571.704907,2571


In [9]:
hp_dict['MiscFeature']                                                           # drop
housePrice.groupby('MiscFeature')['SalePrice'].agg(['mean', 'std', 'count'])
housePrice = housePrice.drop('MiscFeature', axis = 1)

In [10]:
hp_dict['Alley']
# housePrice.Alley.unique()
housePrice.Alley.value_counts()
housePrice.Alley = housePrice.Alley.fillna('none')
housePrice.groupby('Alley')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
Alley,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Grvl,105,126345.990476,36101.160174
Pave,63,178559.698413,45650.14634
none,2412,180297.776949,76096.323125


In [11]:
hp_dict['Fence']  
# Good Privacy - GdPr
# Good Wood - GdWo
# Minimum Privacy - MnPr
# Minimum Wood - MnWw
housePrice.Fence = housePrice.Fence.fillna('none')
housePrice.groupby('Fence')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
Fence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GdPrv,111,176726.576577,62331.695505
GdWo,103,144750.223301,47246.448542
MnPrv,301,146485.810631,44748.238487
MnWw,10,131965.0,22333.856337
none,2055,184650.134307,78736.30216


In [12]:
hp_dict['FireplaceQu']
housePrice.FireplaceQu = housePrice.FireplaceQu.fillna('none')
housePrice.groupby('FireplaceQu')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
FireplaceQu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,35,340444.542857,112432.168836
Fa,70,168482.142857,36079.397184
Gd,619,223019.756058,91182.521787
Po,43,142528.674419,34116.467478
TA,572,204473.805944,63380.868389
none,1241,140650.717164,41618.136667


In [13]:
hp_dict['LotFrontage']                                 # impute using LotArea
housePrice.loc[housePrice.LotFrontage == 31]
housePrice.groupby('LotFrontage')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
LotFrontage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21.0,46,95106.304348,13631.687979
22.0,1,217500.000000,
24.0,49,144108.163265,20526.251857
25.0,1,260000.000000,
26.0,3,183666.666667,6506.407099
...,...,...,...
168.0,1,274725.000000,
174.0,1,403000.000000,
195.0,1,155000.000000,
200.0,1,130000.000000,


#### [Iterative Imputing](https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/)

In [14]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

Lot_df = housePrice[['LotFrontage','LotArea']]
imputer = IterativeImputer()
imputer.fit(Lot_df)
LotTrans = imputer.transform(Lot_df)
Lot_Imputed = pd.DataFrame(LotTrans)
housePrice['LotFrontage'] = Lot_Imputed[0]

In [15]:
nas(housePrice)

LotFrontage     2580
GarageYrBlt      129
GarageQual       129
GarageFinish     129
GarageCond       129
GarageType       127
BsmtExposure      71
BsmtFinType2      70
BsmtQual          69
BsmtCond          69
BsmtFinType1      69
MasVnrArea        14
MasVnrType        14
BsmtHalfBath       2
BsmtFullBath       2
Electrical         1
BsmtUnfSF          1
BsmtFinSF2         1
GarageCars         1
GarageArea         1
BsmtFinSF1         1
TotalBsmtSF        1
dtype: int64

In [16]:
hp_dict['GarageCond']                                                       
housePrice.GarageCond = housePrice.GarageCond.fillna('none')
housePrice.groupby('GarageCond')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
GarageCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,3,125000.0,3905.124838
Fa,67,108543.791045,37994.75126
Gd,14,188278.571429,72267.145196
Po,11,106618.181818,27559.238675
TA,2356,184277.848048,74495.805841
none,129,106814.604651,33428.25129


In [17]:
hp_dict['GarageQual']                                             # should we try to compile garage? (is that clusters?)
housePrice.GarageQual = housePrice.GarageQual.fillna('none')              # compare relationship between rankings 
housePrice.groupby('GarageQual')['SalePrice'].agg(['count','mean','std'])           # across garage classifications

Unnamed: 0_level_0,count,mean,std
GarageQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,3,241000.0,202680.166765
Fa,110,118884.1,39721.189121
Gd,22,242833.5,119708.930561
Po,4,94350.0,31556.140448
TA,2312,184297.039792,73708.905056
none,129,106814.604651,33428.25129


In [18]:
hp_dict['GarageFinish']                                             
housePrice.GarageFinish = housePrice.GarageFinish.fillna('none')
housePrice.groupby('GarageFinish')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
GarageFinish,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fin,621,231925.247987,89751.688867
RFn,718,199724.56546,64714.442479
Unf,1112,142254.430755,44088.459471
none,129,106814.604651,33428.25129


In [19]:
hp_dict['GarageYrBlt']                                                         
housePrice.GarageYrBlt = housePrice.GarageYrBlt.fillna(0)
housePrice.groupby('GarageYrBlt')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
GarageYrBlt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,129,106814.604651,33428.251290
1895.0,1,89000.000000,
1900.0,5,124760.000000,31569.494769
1906.0,1,135000.000000,
1908.0,1,240000.000000,
...,...,...,...
2006.0,67,263140.507463,82094.024392
2007.0,67,258978.552239,98539.246379
2008.0,44,292216.795455,87551.849537
2009.0,12,297876.666667,126085.057796


In [20]:
hp_dict['GarageType']                                             
housePrice.GarageType = housePrice.GarageType.fillna('none')
housePrice.groupby('GarageType')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
GarageType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2Types,21,154080.952381,36478.125213
Attchd,1527,200661.56909,75910.133901
Basment,27,155150.0,33391.342153
BuiltIn,153,233054.464052,80768.842199
CarPort,9,110716.666667,27859.446333
Detchd,716,133291.765363,39764.381342
none,127,106048.622047,33116.655928


In [21]:
hp_dict['BsmtExposure']                                             
housePrice.BsmtExposure = housePrice.BsmtExposure.fillna('none')
housePrice.groupby('BsmtExposure')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
BsmtExposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Av,344,201482.340116,76537.790904
Gd,242,262873.061983,113165.344846
Mn,215,185463.330233,70096.651361
No,1708,163135.738876,57349.711224
none,71,112086.816901,42548.995226


In [22]:
hp_dict['BsmtFinType2']                                             
housePrice.BsmtFinType2 = housePrice.BsmtFinType2.fillna('none')
housePrice.groupby('BsmtFinType2')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
BsmtFinType2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALQ,53,188644.339623,82192.691433
BLQ,64,161466.140625,57560.260186
GLQ,32,211278.125,101942.759031
LwQ,84,160725.297619,38958.99093
Rec,97,164338.402062,55684.315058
Unf,2180,181206.894954,76175.880106
none,70,111830.914286,44695.624964


In [23]:
hp_dict['BsmtFinType1']                                             
housePrice.BsmtFinType1 = housePrice.BsmtFinType1.fillna('none')
housePrice.groupby('BsmtFinType1')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
BsmtFinType1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALQ,397,163515.198992,48331.383704
BLQ,251,150852.749004,44819.815866
GLQ,753,231889.36919,87608.901286
LwQ,139,153106.834532,55352.19881
Rec,265,144782.030189,49548.992468
Unf,706,162617.998584,63191.379338
none,69,109335.710145,39810.10668


In [24]:
hp_dict['BsmtQual']                                            # does it matter if some basements (garages) don't have           
housePrice.BsmtQual = housePrice.BsmtQual.fillna('none')       # certain classifications, and are seen as 'none'
housePrice.groupby('BsmtQual')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
BsmtQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,188,327308.005319,102928.298214
Fa,78,113467.051282,38699.187921
Gd,1077,200961.085422,56495.645593
Po,2,84950.0,5727.564928
TA,1166,141389.656947,40290.731298
none,69,109335.710145,39810.10668


In [25]:
hp_dict['BsmtCond']                                             
housePrice.BsmtCond = housePrice.BsmtCond.fillna('none')
housePrice.groupby('BsmtCond')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
BsmtCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ex,3,195000.0,83216.584885
Fa,88,122382.875,44683.700374
Gd,101,206709.128713,69427.40786
Po,3,95133.333333,34428.70653
TA,2316,181058.699914,74981.038735
none,69,109335.710145,39810.10668


In [26]:
hp_dict['MasVnrType']
housePrice.MasVnrType = housePrice.MasVnrType.fillna('unknown')
housePrice.groupby('MasVnrType')['SalePrice'].agg(['count','mean','std'])

Unnamed: 0_level_0,count,mean,std
MasVnrType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BrkCmn,20,145450.2,43092.544242
BrkFace,804,208615.353234,82393.220408
,1559,154365.317511,54210.546901
Stone,183,246845.519126,101989.391799
unknown,14,209272.857143,51066.242866


In [27]:
# iterative imputer again, using area-related columns
hp_dict['MasVnrArea']
size_related = housePrice.filter(regex='SF|Area$')

impute = IterativeImputer(max_iter=100)
size_trans = impute.fit_transform(size_related)
size_trans = pd.DataFrame(size_trans)
housePrice.MasVnrArea = size_trans[2]
housePrice.BsmtFinSF1 = size_trans[3]
housePrice.BsmtFinSF2 = size_trans[4]
housePrice.BsmtUnfSF = size_trans[5]
housePrice.TotalBsmtSF = size_trans[6]
housePrice.GarageArea = size_trans[8]



In [28]:
list(enumerate(size_related))

[(0, 'GrLivArea'),
 (1, 'LotArea'),
 (2, 'MasVnrArea'),
 (3, 'BsmtFinSF1'),
 (4, 'BsmtFinSF2'),
 (5, 'BsmtUnfSF'),
 (6, 'TotalBsmtSF'),
 (7, '1stFlrSF'),
 (8, '2ndFlrSF'),
 (9, 'LowQualFinSF'),
 (10, 'GarageArea'),
 (11, 'WoodDeckSF'),
 (12, 'OpenPorchSF'),
 (13, 'PoolArea')]

In [29]:
nas(housePrice)

LotFrontage     2580
MasVnrArea      2580
BsmtFinSF1      2580
BsmtFinSF2      2580
BsmtUnfSF       2580
TotalBsmtSF     2580
GarageArea      2580
BsmtFullBath       2
BsmtHalfBath       2
Electrical         1
GarageCars         1
dtype: int64

In [30]:
hp_dict['BsmtHalfBath']
housePrice.groupby('BsmtFullBath')['SalePrice'].agg(['count','mean','std'])
housePrice[housePrice['BsmtFullBath'].isna()]

bath_df = housePrice.filter(regex='Bath$')

impute = IterativeImputer(max_iter=100)
bath_trans = impute.fit_transform(bath_df)
bath_trans = pd.DataFrame(bath_trans)
housePrice.BsmtFullBath = bath_trans[0]
housePrice.BsmtHalfBath = bath_trans[1]

In [31]:
list(enumerate(bath_df))

[(0, 'BsmtFullBath'), (1, 'BsmtHalfBath'), (2, 'FullBath'), (3, 'HalfBath')]

In [32]:
nas(housePrice)

LotFrontage     2580
MasVnrArea      2580
BsmtFinSF1      2580
BsmtFinSF2      2580
BsmtUnfSF       2580
TotalBsmtSF     2580
BsmtFullBath    2580
BsmtHalfBath    2580
GarageArea      2580
Electrical         1
GarageCars         1
dtype: int64

In [33]:
hp_dict['Electrical']
housePrice[housePrice['Electrical'].isna()]
housePrice.groupby('Electrical')['SalePrice'].agg(['count','mean','std'])
housePrice['Electrical'] = housePrice['Electrical'].fillna('SBrkr')

In [34]:
hp_dict['GarageCars']
housePrice[housePrice['GarageCars'].isna()]
housePrice.groupby('GarageCars')['SalePrice'].agg(['count','mean','std'])

garage = housePrice[['GarageCars', 'GarageArea']]

impute = IterativeImputer(max_iter=100)
garage_trans = impute.fit_transform(garage)
garage_trans = pd.DataFrame(garage_trans)
housePrice.GarageCars = garage_trans[0]

In [35]:
nas(housePrice)

LotFrontage     2580
MasVnrArea      2580
BsmtFinSF1      2580
BsmtFinSF2      2580
BsmtUnfSF       2580
TotalBsmtSF     2580
BsmtFullBath    2580
BsmtHalfBath    2580
GarageCars      2580
GarageArea      2580
dtype: int64

In [36]:
housePrice.to_csv('HousePrice_Clean.csv')

#### Date Columns

In [37]:
# DATES - dtypes        ### thoughts about combining columns / datetime?

date_ = housePrice.filter(regex='Yr|Mo|Date')
date_.isna().sum()

housePrice['GarageYrBlt'].dtypes                       
housePrice['GarageYrBlt'].astype('int')
# pd.to_datetime(housePrice.GarageYrBlt, format = '%Y')

PID
909176150    1939
905476230    1984
911128020    1930
535377150    1940
534177230    2001
             ... 
903205040    1916
905402060    1955
909275030    1949
907192040    2000
906223180    1993
Name: GarageYrBlt, Length: 2580, dtype: int32

#### Encoding ordinals and One Hot Encoding for nominals

In [38]:
housePrice = pd.read_csv('HousePrice_Clean.csv', index_col='PID')

In [39]:
hp_num = housePrice.select_dtypes(include = [np.number])
hp_cat = housePrice.select_dtypes(exclude = [np.number])
hp_cat_ord = []
hp_cat_nom = []
dropped = []

In [40]:
# how many numeric and categorical values
print("numeric", hp_num.shape[1])
print("categorical", hp_cat.shape[1])

numeric 37
categorical 42


In [41]:
housePrice.groupby('MSZoning')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('MSZoning')
len(hp_cat_nom)

1

In [42]:
housePrice.groupby('Street')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Street')
hp_cat_nom

['MSZoning', 'Street']

In [43]:
housePrice.groupby('Alley')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Alley')
hp_cat_nom

['MSZoning', 'Street', 'Alley']

In [44]:
housePrice.groupby('LotShape')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('LotShape')
hp_cat_nom

['MSZoning', 'Street', 'Alley', 'LotShape']

In [45]:
housePrice.groupby('LandContour')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('LandContour')
hp_cat_nom

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour']

In [46]:
housePrice.groupby('Utilities')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Utilities')
hp_cat_nom

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities']

In [47]:
housePrice.groupby('LotConfig')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('LotConfig')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig']

In [48]:
housePrice.groupby('LandSlope')['SalePrice'].agg(['count','mean','std'])  # ordinal --> create scale map dict 
hp_cat_ord.append('LandSlope')
scale = {"Gtl":1, "Mod":2, "Sev":3}
housePrice['LandSlope_Enc'] = housePrice['LandSlope'].replace(scale)
housePrice = housePrice.drop(columns='LandSlope')
hp_cat_ord

['LandSlope']

In [49]:
housePrice.groupby('Neighborhood')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Neighborhood')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood']

In [50]:
housePrice.groupby('Condition1')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Condition1')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1']

In [51]:
housePrice.groupby('Condition2')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Condition2')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2']

In [52]:
housePrice.groupby('BldgType')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('BldgType')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType']

In [53]:
housePrice.groupby('HouseStyle')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('HouseStyle')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle']

In [54]:
housePrice.groupby('RoofStyle')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('RoofStyle')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle']

In [55]:
housePrice.groupby('RoofMatl')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('RoofMatl')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl']

In [56]:
housePrice.groupby('Exterior1st')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Exterior1st')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st']

In [57]:
housePrice.groupby('Exterior2nd')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Exterior2nd')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd']

In [58]:
housePrice.groupby('MasVnrType')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('MasVnrType')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType']

In [59]:
housePrice.groupby('ExterQual')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('ExterQual')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['ExterQual_Enc'] = housePrice['ExterQual'].replace(scale)
housePrice = housePrice.drop(columns='ExterQual')
hp_cat_ord

['LandSlope', 'ExterQual']

In [60]:
housePrice.groupby('ExterCond')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('ExterCond')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['ExterCond_Enc'] = housePrice['ExterCond'].replace(scale)
housePrice = housePrice.drop(columns='ExterCond')
hp_cat_ord

['LandSlope', 'ExterQual', 'ExterCond']

In [61]:
housePrice.groupby('Foundation')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Foundation')
hp_cat_nom

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation']

In [62]:
housePrice.groupby('BsmtQual')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('BsmtQual')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['BsmtQual_Enc'] = housePrice['BsmtQual'].replace(scale)
housePrice = housePrice.drop(columns='BsmtQual')
hp_cat_ord

['LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual']

In [63]:
housePrice.groupby('BsmtCond')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('BsmtCond')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['BsmtCond_Enc'] = housePrice['BsmtCond'].replace(scale)
housePrice = housePrice.drop(columns='BsmtCond')
hp_cat_ord

['LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond']

In [64]:
housePrice.groupby('BsmtExposure')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('BsmtExposure')
scale = {"none":0, "No":1, "Mn": 2, "Av":3, "Gd":4, "Ex":5}
housePrice['BsmtExposure_Enc'] = housePrice['BsmtExposure'].replace(scale)
housePrice = housePrice.drop(columns='BsmtExposure')
len(hp_cat_ord)

6

In [65]:
housePrice.groupby('BsmtFinType1')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('BsmtFinType1')
scale = {"none":0, "Unf":1, "LwQ": 2, "Rec":3, "BLQ":4, "ALQ":5, "GLQ":6}
housePrice['BsmtFinType1_Enc'] = housePrice['BsmtFinType1'].replace(scale)
housePrice = housePrice.drop(columns='BsmtFinType1')
len(hp_cat_ord)

7

In [66]:
housePrice.groupby('BsmtFinType2')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('BsmtFinType2')
scale = {"none":0, "Unf":1, "LwQ": 2, "Rec":3, "BLQ":4, "ALQ":5, "GLQ":6}
housePrice['BsmtFinType2_Enc'] = housePrice['BsmtFinType2'].replace(scale)
housePrice = housePrice.drop(columns='BsmtFinType2')
hp_cat_ord

['LandSlope',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2']

In [67]:
housePrice.groupby('Heating')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Heating')
len(hp_cat_nom)

19

In [68]:
housePrice.groupby('HeatingQC')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('HeatingQC')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['HeatingQC_Enc'] = housePrice['HeatingQC'].replace(scale)
housePrice = housePrice.drop(columns='HeatingQC')
hp_cat_ord

['LandSlope',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC']

In [69]:
housePrice.groupby('CentralAir')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('CentralAir')
len(hp_cat_nom)

20

In [70]:
housePrice.groupby('Electrical')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('Electrical')
len(hp_cat_nom)

21

In [71]:
housePrice.groupby('KitchenQual')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('KitchenQual')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['KitchenQual_Enc'] = housePrice['KitchenQual'].replace(scale)
housePrice = housePrice.drop(columns='KitchenQual')
hp_cat_ord

['LandSlope',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC',
 'KitchenQual']

In [72]:
housePrice.groupby('Functional')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('Functional')
scale = {"Typ":0, "Min1":1, "Min2": 2, "Mod":3, "Maj1":4, "Maj2":5, "Sal":6}
housePrice['Functional_Enc'] = housePrice['Functional'].replace(scale)
housePrice = housePrice.drop(columns='Functional')
hp_cat_ord

['LandSlope',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC',
 'KitchenQual',
 'Functional']

In [73]:
housePrice.groupby('FireplaceQu')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('FireplaceQu')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['FireplaceQu_Enc'] = housePrice['FireplaceQu'].replace(scale)
housePrice = housePrice.drop(columns='FireplaceQu')
hp_cat_ord

['LandSlope',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu']

In [74]:
housePrice.groupby('GarageType')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('GarageType')
len(hp_cat_nom)

22

In [75]:
housePrice.groupby('GarageFinish')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('GarageFinish')
scale = {"none":0, "Unf":1, "RFn": 2, "Fin":3}
housePrice['GarageFinish_Enc'] = housePrice['GarageFinish'].replace(scale)
housePrice = housePrice.drop(columns='GarageFinish')
len(hp_cat_ord)

13

In [76]:
housePrice.groupby('GarageQual')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('GarageQual')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['GarageQual_Enc'] = housePrice['GarageQual'].replace(scale)
housePrice = housePrice.drop(columns='GarageQual')
len(hp_cat_ord)

14

In [77]:
housePrice.groupby('GarageCond')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('GarageCond')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['GarageCond_Enc'] = housePrice['GarageCond'].replace(scale)
housePrice = housePrice.drop(columns='GarageCond')
len(hp_cat_ord)

15

In [78]:
housePrice.groupby('PavedDrive')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('PavedDrive')
scale = {"N":0, "P":1, "Y": 2}
housePrice['PavedDrive_Enc'] = housePrice['PavedDrive'].replace(scale)
housePrice = housePrice.drop(columns='PavedDrive')
len(hp_cat_ord)

16

In [79]:
housePrice.groupby('PoolQC')['SalePrice'].agg(['count','mean','std'])
hp_cat_ord.append('PoolQC')
scale = {"none":0, "TA":1, "Po": 2, "Fa":3, "Gd":4, "Ex":5}
housePrice['PoolQC_Enc'] = housePrice['PoolQC'].replace(scale)
housePrice = housePrice.drop(columns='PoolQC')
len(hp_cat_ord)

17

In [80]:
housePrice.groupby('Fence')['SalePrice'].agg(['count','mean','std'])   ### measuring two completely different things
dropped.append('Fence')
housePrice = housePrice.drop(columns='Fence')                                           ## DROP

In [81]:
housePrice.groupby('SaleType')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('SaleType')
len(hp_cat_nom)

23

In [82]:
housePrice.groupby('SaleCondition')['SalePrice'].agg(['count','mean','std'])
hp_cat_nom.append('SaleCondition')
len(hp_cat_nom)

24

In [83]:
# see if ordinal/nominal add up
print("ordinal", len(hp_cat_ord))
print("nominal", len(hp_cat_nom))
print("categorical total", hp_cat.shape[1])

ordinal 17
nominal 24
categorical total 42


In [84]:
hp_cat.columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'SaleType',
       'SaleCondition'],
      dtype='object')

#### OneHot Encoding for nominals

In [85]:
hp_num2 = housePrice.select_dtypes(include = [np.number])
hp_cat2 = housePrice.select_dtypes(exclude = [np.number])
print("numberfied:", hp_num2.shape[1])
print("to OneHot:", hp_cat2.shape[1])

numberfied: 54
to OneHot: 24


In [86]:
housePrice = pd.get_dummies(housePrice, columns=hp_cat_nom, drop_first=True)

In [87]:
housePrice.to_csv('HousePrice_Encoded.csv')