In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn import preprocessing 
# from sklearn.preprocessing import CategoricalEncoder
from category_encoders import BinaryEncoder

In [2]:
tdata = pd.read_csv("train.csv")
print(tdata.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [3]:
print(tdata["Fence"])
print(tdata["Fence"].value_counts())

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
1455      NaN
1456    MnPrv
1457    GdPrv
1458      NaN
1459      NaN
Name: Fence, Length: 1460, dtype: object
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64


In [4]:
Fence_dict = {"GdPrv":4,"MnPrv":3,"GdWo":2,"MnWw":1,"NA":0}
tdata["Fence"] = tdata["Fence"].map(Fence_dict).fillna(0).astype(int)
print(tdata["Fence"])
print(tdata["Fence"].value_counts())
print(np.mean(tdata["Fence"]))

0       0
1       0
2       0
3       0
4       0
       ..
1455    0
1456    3
1457    4
1458    0
1459    0
Name: Fence, Length: 1460, dtype: int32
0    1179
3     157
4      59
2      54
1      11
Name: Fence, dtype: int64
0.5657534246575342


Ordinal Categories: OverallQual, OverallCond, ExterQual, ExterCond, BsmtQual **, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond, PavedDrive **, PoolQC, Fence

OverallQual and OverallCond are already on a numbered scale, all the others are not

In [5]:
Excellent_to_poor_dict = {"Ex": 5, "Gd": 4, "TA":3, "Fa":2, "Po":1}
Na_to_excellent_dict = {"Ex": 5, "Gd": 4, "Ta":3, "Fa":2, "Po":1, "NA":0}
Na_to_gd_dict = {"Gd": 4, "Av": 3, "Mn":2, "No":1, "NA":0}
Bsmt_finish_dict = {"GLQ":6,"ALQ":5,"BLQ":4,"Rec":3,"LwQ":2,"Unf":1,"NA":0}
Functionality_dict = {"Typ":8,"Min1":7,"Min2":6,"Mod":5,"Maj1":4,"Maj2":3,"Sev":2,"Sal":1} # should salvage be 0 or 1
Garage_fin_dict = {"Fin":3,"RFn":2,"Unf":1,"NA":0}
Na_to_fa_ex_dict = {"Ex": 4, "Gd": 3, "Ta":2, "Fa":1, "NA":0}
Fence_dict = {"GdPrv":4,"MnPrv":3,"GdWo":2,"MnWw":1,"NA":0}

tdata["ExterQual"] = tdata["ExterQual"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["ExterCond"] = tdata["ExterCond"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["BsmtQual"] = tdata["BsmtQual"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["BsmtCond"] = tdata["BsmtCond"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["BsmtExposure"] = tdata["BsmtExposure"].map(Na_to_gd_dict) # check this way of doing it. There is a no and an NA which might be problematic
tdata["BsmtFinType1"] = tdata["BsmtFinType1"].map(Bsmt_finish_dict).fillna(0).astype(int)
tdata["BsmtFinType2"] = tdata["BsmtFinType2"].map(Bsmt_finish_dict).fillna(0).astype(int)
tdata["HeatingQC"] = tdata["HeatingQC"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["KitchenQual"] = tdata["KitchenQual"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["Functional"] = tdata["Functional"].map(Functionality_dict).fillna(0).astype(int)
tdata["FireplaceQu"] = tdata["FireplaceQu"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["GarageFinish"] = tdata["GarageFinish"].map(Garage_fin_dict).fillna(0).astype(int)
tdata["GarageQual"] = tdata["GarageQual"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["GarageCond"] = tdata["GarageCond"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["PoolQC"] = tdata["PoolQC"].map(Na_to_fa_ex_dict).fillna(0).astype(int)
tdata["Fence"] = tdata["Fence"].map(Fence_dict).fillna(0).astype(int)


Nominal Categories: MSSubClass, MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, RoofStyle, RoofMatl

Binary Encode
"MSSubClass", "MSZoning", "Neighborhood","Condition1","Condition2", "HouseStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "SaleType"
May have to come back later and redo my general rule of thumb was more than 6 to binary encode.
It may be good to binary and one hot encode these and observe the differences. Binary encoding can lead to reduced interperability.

In [7]:
Binary_encode_list = ["MSSubClass", "MSZoning", "Neighborhood","Condition1","Condition2", "HouseStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "SaleType"]
Binary_encode_list_df = BinaryEncoder(Binary_encode_list).fit_transform(tdata)

print(Binary_encode_list_df.head())
# CHECK WORK HERE TOMORROW

   Id  MSSubClass  MSZoning_0  MSZoning_1  MSZoning_2  LotFrontage  LotArea  \
0   1          60           0           0           1         65.0     8450   
1   2          20           0           0           1         80.0     9600   
2   3          60           0           0           1         68.0    11250   
3   4          70           0           0           1         60.0     9550   
4   5          60           0           0           1         84.0    14260   

   Street_0  Street_1  Alley_0  ...  MoSold  YrSold  SaleType_0  SaleType_1  \
0         0         1        1  ...       2    2008           0           0   
1         0         1        1  ...       5    2007           0           0   
2         0         1        1  ...       9    2008           0           0   
3         0         1        1  ...       2    2006           0           0   
4         0         1        1  ...      12    2008           0           0   

   SaleType_2  SaleType_3  SaleCondition_0  SaleCo

In [13]:
print(tdata.columns)
print(Binary_encode_list_df.columns)
print(len(tdata.rows))
print(len(Binary_encode_list_df.rows))

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

AttributeError: 'DataFrame' object has no attribute 'rows'

One Hot Encoded:

Binary Encoded: Neighborhood, BldgType, HouseStyle, RoofStyle, RoofMatl

Boolian Encoded:

Numerical Catecories: LotFrontage, LotArea, 

Time Related Categories: YearBuilt, YearRemodAdd, 