# Basic

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# feature engine
# NOTE: this import rebinds the name `OrdinalEncoder`. From this line on it
# refers to the feature_engine class, not the sklearn class imported above.
from feature_engine.encoding import CountFrequencyEncoder, OrdinalEncoder, RareLabelEncoder

In [2]:
# Read the house-price data with every column.
# To load only a subset, pass for example
# usecols=["Neighborhood", "Exterior1st", "Exterior2nd", "SalePrice"].
csv_path = "../../houseprice.csv"
data = pd.read_csv(csv_path)

# preview the first rows
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Print the number of distinct values ("labels") found in every column.
for col, values in data.items():
    n_labels = len(values.unique())
    print(col, ": ", n_labels, " labels")

Id :  1460  labels
MSSubClass :  15  labels
MSZoning :  5  labels
LotFrontage :  111  labels
LotArea :  1073  labels
Street :  2  labels
Alley :  3  labels
LotShape :  4  labels
LandContour :  4  labels
Utilities :  2  labels
LotConfig :  5  labels
LandSlope :  3  labels
Neighborhood :  25  labels
Condition1 :  9  labels
Condition2 :  8  labels
BldgType :  5  labels
HouseStyle :  8  labels
OverallQual :  10  labels
OverallCond :  9  labels
YearBuilt :  112  labels
YearRemodAdd :  61  labels
RoofStyle :  6  labels
RoofMatl :  8  labels
Exterior1st :  15  labels
Exterior2nd :  16  labels
MasVnrType :  5  labels
MasVnrArea :  328  labels
ExterQual :  4  labels
ExterCond :  5  labels
Foundation :  6  labels
BsmtQual :  5  labels
BsmtCond :  5  labels
BsmtExposure :  5  labels
BsmtFinType1 :  7  labels
BsmtFinSF1 :  637  labels
BsmtFinType2 :  7  labels
BsmtFinSF2 :  144  labels
BsmtUnfSF :  780  labels
TotalBsmtSF :  721  labels
Heating :  6  labels
HeatingQC :  5  labels
CentralAir :  2  

In [4]:
# distinct categories observed in Neighborhood
data["Neighborhood"].unique()

array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
       'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
       'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
       'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
       'Blueste'], dtype=object)

In [5]:
# distinct categories observed in Exterior1st
data["Exterior1st"].unique()

array(['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
       'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
       'Stone', 'ImStucc', 'CBlock'], dtype=object)

In [6]:
# distinct categories observed in Exterior2nd
data["Exterior2nd"].unique()

array(['VinylSd', 'MetalSd', 'Wd Shng', 'HdBoard', 'Plywood', 'Wd Sdng',
       'CmentBd', 'BrkFace', 'Stucco', 'AsbShng', 'Brk Cmn', 'ImStucc',
       'AsphShn', 'Stone', 'Other', 'CBlock'], dtype=object)

In [7]:
# Separate the target from the predictors.
target = "SalePrice"
y = data[target]
X = data.drop(columns=[target])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Report the shape of each part of the split.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)

X_train shape: (1022, 80)
X_test shape: (438, 80)
y_train shape: (1022,)
y_test shape: (438,)


In [8]:
# number of missing values per column in the train set
X_train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      189
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

## Ordinal Encoding (sklearn)

In [9]:
# cat_vars = list(X_train.select_dtypes(include='O'))
# cat_vars

In [10]:
# encoder = OrdinalEncoder()

In [11]:
# ct = ColumnTransformer(
#     [
#         ('ordinal_encoder', encoder, cat_vars)
#     ], 
#     remainder='passthrough'
# )

# ct.set_output(transform='pandas')

In [12]:
# ct.fit(X_train)

In [13]:
# ct.named_transformers_['ordinal_encoder'].categories_

In [14]:
# # transform data
# X_train_enc = ct.transform(X_train)
# X_test_enc = ct.transform(X_test)

# X_train_enc.head()

## Count or Frequency Encoding (Feature Engine)

In [15]:
# count_enc = CountFrequencyEncoder(
#     encoding_method='count', # to do frequency ==> encoding_method='frequency'
#     variables=["Neighborhood", "Exterior1st", "Exterior2nd"],
# )

# count_enc.fit(X_train)

In [16]:
# # in the encoder dict we can observe the number of 
# # observations per category for each variable

# count_enc.encoder_dict_

In [17]:
# X_train = count_enc.transform(X_train)
# X_test = count_enc.transform(X_test)

# X_train.head()

# Monotonic

## Ordered Integer Encoding

In [18]:
# # Let's explore the relationship of the categories with the target

# for var in ["Neighborhood", "Exterior1st", "Exterior2nd"]:
    
#     fig, axes = plt.subplots()
#     axes = y_train.groupby(X_train[var]).mean().plot()
#     axes.set_title(f'Relationship between {var} and SalePrice')
#     axes.set_ylabel('Mean SalePrice')
#     plt.tight_layout()
#     plt.show()
#     print()

In [19]:
# ordinal_enc = OrdinalEncoder(
#     encoding_method = 'ordered',
#     variables = ["Neighborhood", "Exterior1st", "Exterior2nd"],
# )

In [20]:
# ordinal_enc.fit(X_train, y_train)

In [21]:
# # in the encoder dict we can observe the integer assigned to each
# # category of each variable

# ordinal_enc.encoder_dict_

In [22]:
# # this is the list of variables that the encoder will transform

# ordinal_enc.variables_

In [23]:
# X_train = ordinal_enc.transform(X_train)
# X_test = ordinal_enc.transform(X_test)

In [24]:
# X_train.head()

In [25]:
# # let's inspect the newly created monotonic relationship
# # between the variables and the target

# for var in ["Neighborhood", "Exterior1st", "Exterior2nd"]:

#     fig, axes = plt.subplots()
#     axes = y_train.groupby(X_train[var]).mean().plot()
#     axes.set_title("Monotonic relationship between {} and SalePrice".format(var))
#     axes.set_ylabel("Mean SalePrice")
#     plt.tight_layout()
#     plt.show()
#     print()    

In [26]:
# # let's check whether the monotonic relationship learned on the train set
# # also holds between the encoded variables and the target in the test set

# for var in ["Neighborhood", "Exterior1st", "Exterior2nd"]:

#     fig, axes = plt.subplots()
#     axes = y_test.groupby(X_test[var]).mean().plot()
#     axes.set_title("Monotonic relationship between {} and SalePrice".format(var))
#     axes.set_ylabel("Mean SalePrice")
#     plt.tight_layout()
#     plt.show()
#     print()    

# Rare Labels

In [27]:
# # Let's visualize the most frequent neighbourhoods

# X_train["Neighborhood"].value_counts().sort_values(ascending=False).head(5)

## OHE with Top Categories

In [28]:
# ohe_enc = OneHotEncoder(
#     handle_unknown='infrequent_if_exist', # unseen categories will be treated like the less frequent ones
#     max_categories = 5, # the number of top categories
#     sparse_output = False # necessary for set output pandas
# )

# ohe_enc.set_output(transform='pandas')

In [29]:
# ohe_enc.fit(X_train)

In [30]:
# ohe_enc.infrequent_categories_

In [31]:
# # the categories found in each variable
# ohe_enc.categories_

In [32]:
# # encode
# X_train = ohe_enc.transform(X_train)
# X_test = ohe_enc.transform(X_test)

In [33]:
# # let's explore the result
# X_train.head()

## Encoding Rare Labels (feature-engine)

In [34]:
# Rare-label encoder: infrequent categories of the listed variables are re-grouped.

# variables to re-group
vars_to_regroup = [
    "Neighborhood",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "ExterQual",
    "BsmtCond",
]

rare_encoder = RareLabelEncoder(
    # minimum relative frequency (0.05 = 5%) for a category to count as non-rare
    tol=0.05,
    # minimal number of categories a variable should have for its rare
    # categories to be re-grouped
    n_categories=4,
    variables=vars_to_regroup,
)

In [35]:
# Fit on a copy in which NaN is replaced by "Missing" in the non-numeric columns only.
# A frame-wide fillna("Missing") would also write the string into numeric columns
# (LotFrontage alone has 189 NaN in the train set) and turn them into object dtype.
cat_fill = dict.fromkeys(X_train.select_dtypes(exclude="number").columns, "Missing")
rare_encoder.fit(X_train.fillna(cat_fill))



In [36]:
# list of variables the fitted encoder operates on
rare_encoder.variables_

['Neighborhood',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'BsmtCond']

In [37]:
# encoder_dict_ maps each variable to the labels the encoder treats as frequent.
# NOTE(review): ExterQual is listed with all of its labels, presumably because it
# has no more than n_categories=4 categories and is therefore not re-grouped — confirm.
rare_encoder.encoder_dict_

{'Neighborhood': ['NAmes',
  'CollgCr',
  'OldTown',
  'Edwards',
  'Sawyer',
  'Somerst',
  'Gilbert'],
 'Exterior1st': ['VinylSd', 'HdBoard', 'Wd Sdng', 'MetalSd', 'Plywood'],
 'Exterior2nd': ['VinylSd', 'Wd Sdng', 'HdBoard', 'MetalSd', 'Plywood'],
 'MasVnrType': ['None', 'BrkFace', 'Stone'],
 'ExterQual': array(['TA', 'Gd', 'Ex', 'Fa'], dtype=object),
 'BsmtCond': ['TA']}

In [38]:
# Replace NaN by "Missing" in the non-numeric columns only, then re-group rare labels.
# A frame-wide fillna("Missing") would also put the string into numeric columns
# (e.g. LotFrontage), turning them into object dtype in the returned frames.
cat_fill = dict.fromkeys(X_train.select_dtypes(exclude="number").columns, "Missing")

X_train = rare_encoder.transform(X_train.fillna(cat_fill))
X_test = rare_encoder.transform(X_test.fillna(cat_fill))