In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn import preprocessing 
# from sklearn.preprocessing import CategoricalEncoder
from category_encoders import BinaryEncoder

In [10]:
tdata = pd.read_csv("train.csv")
print(tdata.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [11]:
print(tdata["Fence"])
print(tdata["Fence"].value_counts())

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
1455      NaN
1456    MnPrv
1457    GdPrv
1458      NaN
1459      NaN
Name: Fence, Length: 1460, dtype: object
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64


In [12]:
Fence_dict = {"GdPrv":4,"MnPrv":3,"GdWo":2,"MnWw":1,"NA":0}
tdata["Fence"] = tdata["Fence"].map(Fence_dict).fillna(0).astype(int)
print(tdata["Fence"])
print(tdata["Fence"].value_counts())
print(np.mean(tdata["Fence"]))

0       0
1       0
2       0
3       0
4       0
       ..
1455    0
1456    3
1457    4
1458    0
1459    0
Name: Fence, Length: 1460, dtype: int32
0    1179
3     157
4      59
2      54
1      11
Name: Fence, dtype: int64
0.5657534246575342


Ordinal Categories: OverallQual, OverallCond, ExterQual, ExterCond, BsmtQual **, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond, PavedDrive **, PoolQC, Fence

Land contour, utilities, LandSlope

OverallQual and OverallCond are already on a numbered scale, all the others are not

In [13]:
Excellent_to_poor_dict = {"Ex": 5, "Gd": 4, "TA":3, "Fa":2, "Po":1}
Na_to_excellent_dict = {"Ex": 5, "Gd": 4, "Ta":3, "Fa":2, "Po":1, "NA":0}
Na_to_gd_dict = {"Gd": 4, "Av": 3, "Mn":2, "No":1, "NA":0}
Bsmt_finish_dict = {"GLQ":6,"ALQ":5,"BLQ":4,"Rec":3,"LwQ":2,"Unf":1,"NA":0}
Functionality_dict = {"Typ":8,"Min1":7,"Min2":6,"Mod":5,"Maj1":4,"Maj2":3,"Sev":2,"Sal":1} # should salvage be 0 or 1
Garage_fin_dict = {"Fin":3,"RFn":2,"Unf":1,"NA":0}
Na_to_fa_ex_dict = {"Ex": 4, "Gd": 3, "Ta":2, "Fa":1, "NA":0}
Fence_dict = {"GdPrv":4,"MnPrv":3,"GdWo":2,"MnWw":1,"NA":0}

tdata["ExterQual"] = tdata["ExterQual"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["ExterCond"] = tdata["ExterCond"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["BsmtQual"] = tdata["BsmtQual"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["BsmtCond"] = tdata["BsmtCond"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["BsmtExposure"] = tdata["BsmtExposure"].map(Na_to_gd_dict) # check this way of doing it. There is a no and an NA which might be problematic
tdata["BsmtFinType1"] = tdata["BsmtFinType1"].map(Bsmt_finish_dict).fillna(0).astype(int)
tdata["BsmtFinType2"] = tdata["BsmtFinType2"].map(Bsmt_finish_dict).fillna(0).astype(int)
tdata["HeatingQC"] = tdata["HeatingQC"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["KitchenQual"] = tdata["KitchenQual"].map(Excellent_to_poor_dict).fillna(0).astype(int)
tdata["Functional"] = tdata["Functional"].map(Functionality_dict).fillna(0).astype(int)
tdata["FireplaceQu"] = tdata["FireplaceQu"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["GarageFinish"] = tdata["GarageFinish"].map(Garage_fin_dict).fillna(0).astype(int)
tdata["GarageQual"] = tdata["GarageQual"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["GarageCond"] = tdata["GarageCond"].map(Na_to_excellent_dict).fillna(0).astype(int)
tdata["PoolQC"] = tdata["PoolQC"].map(Na_to_fa_ex_dict).fillna(0).astype(int)
tdata["Fence"] = tdata["Fence"].map(Fence_dict).fillna(0).astype(int)


In [14]:
print(tdata["LandContour"])
Land_contour_dict = {"Lvl":4, "Bnk":3, "HLS":2, "Low": 1} #think about whether low should be 1 or 0
tdata["LandContour"] = tdata["LandContour"].map(Land_contour_dict).fillna(0).astype(int)
print(tdata["LandContour"])

Utilities_dict = {"AllPub":4,"NoSewr":3,"NoSeWa":2,"ELO":1}
tdata["Utilities"] = tdata["Utilities"].map(Utilities_dict).fillna(0).astype(int)

slope_dict = {"Gtl":3,"Mod":2,"Sev":1}
tdata["LandSlope"] = tdata["LandSlope"].map(slope_dict).fillna(0).astype(int)

0       Lvl
1       Lvl
2       Lvl
3       Lvl
4       Lvl
       ... 
1455    Lvl
1456    Lvl
1457    Lvl
1458    Lvl
1459    Lvl
Name: LandContour, Length: 1460, dtype: object
0       4
1       4
2       4
3       4
4       4
       ..
1455    4
1456    4
1457    4
1458    4
1459    4
Name: LandContour, Length: 1460, dtype: int32


Nominal Categories: MSSubClass, MSZoning, Street, Alley, LotShape, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, RoofStyle, RoofMatl

Binary Encode
"MSSubClass", "MSZoning", "Neighborhood","Condition1","Condition2", "HouseStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "SaleType"
May have to come back later and redo my general rule of thumb was more than 6 to binary encode.
It may be good to binary and one hot encode these and observe the differences. Binary encoding can lead to reduced interperability.

In [15]:
Binary_encode_list = ["MSSubClass", "MSZoning", "Neighborhood","Condition1","Condition2", "HouseStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "SaleType"]
tdata = BinaryEncoder(cols=Binary_encode_list).fit_transform(tdata)

print(tdata.head())
# CHECK WORK HERE TOMORROW

   Id  MSSubClass_0  MSSubClass_1  MSSubClass_2  MSSubClass_3  MSZoning_0  \
0   1             0             0             0             1           0   
1   2             0             0             1             0           0   
2   3             0             0             0             1           0   
3   4             0             0             1             1           0   
4   5             0             0             0             1           0   

   MSZoning_1  MSZoning_2  LotFrontage  LotArea  ... MiscFeature MiscVal  \
0           0           1         65.0     8450  ...         NaN       0   
1           0           1         80.0     9600  ...         NaN       0   
2           0           1         68.0    11250  ...         NaN       0   
3           0           1         60.0     9550  ...         NaN       0   
4           0           1         84.0    14260  ...         NaN       0   

  MoSold  YrSold  SaleType_0 SaleType_1  SaleType_2  SaleType_3  \
0      2    2

One Hot Encoded: "Street", "Alley", "LotShape", "LotConfig", "BldgType", "RoofStyle", "MasVnrType", "Foundation", "Heating", "Electrical", "GarageType", "PavedDrive", "MiscFeature", "SaleCondition"

Figure out what is wrong here

In [19]:
one_hot_encode_list = ["Street", "Alley", "LotShape", "LotConfig", "BldgType", "RoofStyle", "MasVnrType", "Foundation", "Heating", "Electrical", "GarageType", "PavedDrive", "MiscFeature", "SaleCondition"]
num = 1
for item in one_hot_encode_list:
    try:
        one_hot_df = pd.get_dummies(tdata[item], prefix="dummy"+str(num), dtype=int)
        tdata = tdata.join(one_hot_df)
        num+=1
    except ValueError:
        print(item + " has repeated values as a possible state")


Street has repeated values as a possible state
Alley has repeated values as a possible state
LotShape has repeated values as a possible state
LotConfig has repeated values as a possible state
BldgType has repeated values as a possible state
RoofStyle has repeated values as a possible state
MasVnrType has repeated values as a possible state
Foundation has repeated values as a possible state
Heating has repeated values as a possible state
Electrical has repeated values as a possible state
GarageType has repeated values as a possible state
PavedDrive has repeated values as a possible state
MiscFeature has repeated values as a possible state
SaleCondition has repeated values as a possible state


In [None]:
print(tdata["dummy_Grvl"])

Binary Encoded: Neighborhood, BldgType, HouseStyle, RoofStyle, RoofMatl

Boolian Encoded:

Numerical Catecories: LotFrontage, LotArea, 

Time Related Categories: YearBuilt, YearRemodAdd, 