In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder 


In [2]:
data = pd.read_csv("../data/no_processed/train_cat.csv")
data.head()
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302 entries, 0 to 2301
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2302 non-null   int64  
 1   MSSubClass     2302 non-null   int64  
 2   MSZoning       2298 non-null   object 
 3   LotFrontage    1915 non-null   float64
 4   LotArea        2302 non-null   int64  
 5   Street         2302 non-null   object 
 6   Alley          148 non-null    object 
 7   LotShape       2302 non-null   object 
 8   LandContour    2302 non-null   object 
 9   Utilities      2301 non-null   object 
 10  LotConfig      2302 non-null   object 
 11  LandSlope      2302 non-null   object 
 12  Neighborhood   2302 non-null   object 
 13  Condition1     2302 non-null   object 
 14  Condition2     2302 non-null   object 
 15  BldgType       2302 non-null   object 
 16  HouseStyle     2302 non-null   object 
 17  OverallQual    2302 non-null   int64  
 18  OverallC

# Categories

1. Ordinal
   1. When the variable is relevant to the target variable
2. Nominal
   1. When the variable is not relevant to the target variable

In [3]:
# ExterQual column
data.ExterQual.value_counts()
# 1. Fa
# 2. TA 
# 3. Gd
# 4. Ex


ExterQual
TA    1437
Gd     757
Ex      85
Fa      23
Name: count, dtype: int64

In [4]:
# ExterCond column
data.ExterCond.value_counts()

ExterCond
TA    2029
Gd     218
Fa      42
Ex      10
Po       3
Name: count, dtype: int64

In [5]:
# SaleCondition column

data.SaleCondition.value_counts()

SaleCondition
Normal     1899
Partial     196
Abnorml     142
Family       37
Alloca       18
AdjLand      10
Name: count, dtype: int64

In [6]:
# MSSubClass column, this columns is nominal because the values of the columns is integers, and be care to not identify as
# ordinal

data.MSSubClass.value_counts()

MSSubClass
20     865
60     443
50     223
120    144
30     112
70     103
160    102
80      95
90      85
190     44
85      32
75      19
45      15
180     14
40       5
150      1
Name: count, dtype: int64

In [7]:
cat_ord = ['ExterQual', 'ExterCond']
cat_nom = ['SaleCondition', 'MSSubClass']

data_test = data.copy()

In [8]:
label_enc = LabelEncoder()
data_test['SaleCondition'] = label_enc.fit_transform(data['SaleCondition'])
data_test.SaleCondition.value_counts()

SaleCondition
4    1899
5     196
0     142
3      37
2      18
1      10
Name: count, dtype: int64

In [9]:
ord_enc = OrdinalEncoder() # Any variable, but should be ordinal variable
data_test[cat_ord] = ord_enc.fit_transform(data_test[cat_ord])
data_test.ExterQual.value_counts(),
data_test.ExterCond.value_counts()

ExterCond
4.0    2029
2.0     218
1.0      42
0.0      10
3.0       3
Name: count, dtype: int64

In [10]:
ord_enc_man = {
  'Ex': 4,
  'Gd': 3,
  'TA': 2,
  'Fa': 1,
}

data_test = data.copy()
data_test['ExterQual'] = data_test['ExterQual'].map(ord_enc_man)
data_test.ExterQual.value_counts()

ExterQual
2    1437
3     757
4      85
1      23
Name: count, dtype: int64

In [11]:
# Hot encoder
data_test = data.copy()
res_dummies = pd.get_dummies(data_test[cat_nom]).astype(int)
print(res_dummies)
data_test = pd.concat([data_test.drop(cat_nom, axis = 1), res_dummies], axis = 1)
data_test.head()

      MSSubClass  SaleCondition_Abnorml  SaleCondition_AdjLand  \
0             20                      0                      0   
1             90                      0                      0   
2             80                      0                      0   
3             20                      0                      0   
4             30                      1                      0   
...          ...                    ...                    ...   
2297          20                      0                      0   
2298          60                      0                      0   
2299          30                      0                      0   
2300          80                      0                      0   
2301          80                      0                      0   

      SaleCondition_Alloca  SaleCondition_Family  SaleCondition_Normal  \
0                        0                     0                     1   
1                        0                     0           

Unnamed: 0,Id,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,YrSold,SaleType,SalePrice,MSSubClass,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,RL,64.0,10475,Pave,,IR1,Lvl,AllPub,Corner,...,2010,WD,245350.0,20,0,0,0,0,1,0
1,2,RL,,18890,Pave,,IR1,Lvl,AllPub,Inside,...,2007,WD,190000.0,90,0,0,0,0,1,0
2,3,RL,,21453,Pave,,IR1,Low,AllPub,CulDSac,...,2006,WD,180000.0,80,0,0,0,0,1,0
3,4,RL,60.0,9600,Pave,,Reg,Lvl,AllPub,Inside,...,2010,WD,128000.0,20,0,0,0,0,1,0
4,5,RM,60.0,8967,Pave,,Reg,Lvl,AllPub,Corner,...,2007,WD,67000.0,30,1,0,0,0,0,0


In [12]:
dic_msubclass = data_test.groupby('MSSubClass').agg({
  'SalePrice': 'median'
}).to_dict()['SalePrice']

data_test['MSSubClass'] = data_test['MSSubClass'].map(dic_msubclass)
data_test['MSSubClass'].value_counts()

MSSubClass
174784.28070    865
228500.00000    443
141044.70850    223
209874.01885    144
109950.00000    112
161000.00000    103
148500.00000    102
172230.98870     95
144000.00000     85
140244.94645     44
161850.25525     32
179500.00000     19
131468.31290     15
98033.06465      14
133000.00000      5
166769.62900      1
Name: count, dtype: int64

# Median (Average)
# example: [1, 2, 3, 4, 5] -> 3
# example: [1, 2, 3, 4, 5, 6] -> (3 + 4) / 2 = 3.5

# Mean
# example: [1, 2, 3, 4, 5] -> (1 + 2 + 3 + 4 + 5) / 5 = 3
