### Weight of Evidence with Feature Engine

In [101]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [114]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.encoding import WoEEncoder as fe_WoEEncoder

In [103]:
# Load only the target (Survived) and the three categorical columns used below.
ds = pd.read_csv(
    'train.csv',
    usecols=['Sex', 'Cabin', 'Embarked', 'Survived'],
)

In [104]:
# Preview the first rows of the loaded columns
ds.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,,S
1,1,female,C85,C
2,1,female,,S
3,1,female,C123,S
4,0,male,,S


In [105]:
# Give the NaN entries of Cabin and Embarked an explicit 'Missing' label,
# so that absence is handled as a category of its own.
ds = ds.fillna({'Cabin': 'Missing', 'Embarked': 'Missing'})

In [106]:
# Check that the former NaN entries now carry the 'Missing' label
ds.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,Missing,S
1,1,female,C85,C
2,1,female,Missing,S
3,1,female,C123,S
4,0,male,Missing,S


In [107]:
# Reduce each cabin code to its first character,
# which gives a simpler variable to practise on.

ds['Cabin'] = ds['Cabin'].astype(str).str.get(0)

In [108]:
# Drop the observations with Cabin == 'T', since there are very few of them

ds = ds.loc[ds['Cabin'].ne('T')]
ds.shape

(890, 4)

In [109]:
# Report how many distinct values each column holds (NaN, if any, counts as one)
for column in ds.columns:
  print(f"column {column} has {ds[column].nunique(dropna=False)} unique categories")

column Survived has 2 unique categories
column Sex has 2 unique categories
column Cabin has 8 unique categories
column Embarked has 4 unique categories


In [110]:
# let's look at the distinct labels of Sex
ds['Sex'].unique()

array(['male', 'female'], dtype=object)

In [111]:
# distinct labels of Embarked, including the 'Missing' placeholder
ds['Embarked'].unique()

array(['S', 'C', 'Q', 'Missing'], dtype=object)

In [112]:
ds['Cabin'].unique()
# note: 'M' is the first letter of the 'Missing' placeholder that replaced NaN cabins

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

### Note: 
With Feature-engine's WoE encoder, the target variable does not need to be a column of the training dataset: it is passed separately (as `y`) when fitting the encoder.

In [113]:
# Hold out 30% of the rows as a test set; predictors and target are passed separately.
X_train, X_test, y_train, y_test = train_test_split(
    ds[['Cabin', 'Sex', 'Embarked']],  # predictors
    ds['Survived'],                    # target
    test_size=0.3,
    random_state=0,                    # fixed seed so the split is reproducible
)

# show the dimensions of both splits
X_train.shape, X_test.shape

((623, 3), (267, 3))

In [117]:
# Create the encoder; only Cabin and Sex are listed, so Embarked is left untouched
woe_enc = fe_WoEEncoder(variables=['Cabin', 'Sex'])

In [118]:
# Fit the encoder on the training data; the target is passed separately as y_train

woe_enc.fit(X_train, y_train)

WoEEncoder(variables=['Cabin', 'Sex'])

In [120]:
# let's observe the weight of evidence (WoE) value learned for each category
woe_enc.encoder_dict_

{'Cabin': {'A': 0.25934659997817017,
  'B': 1.4633194043041062,
  'C': 0.7420013467774647,
  'D': 1.3579588886462801,
  'E': 1.4633194043041062,
  'F': 0.9731130677408513,
  'G': 0.4134972798054287,
  'M': -0.3499663991719764},
 'Sex': {'female': 1.5906998430667598, 'male': -1.0303209254526486}}

In [123]:
# show the variables that the encoder will transform
woe_enc.variables_

['Cabin', 'Sex']

In [122]:
# Replace the categories of the encoded variables with their learned values
# in both the train and the test set.
# NOTE(review): this overwrites X_train / X_test with their encoded versions,
# so re-running the cell would feed already-encoded data to the encoder.
X_train = woe_enc.transform(X_train)
X_test = woe_enc.transform(X_test)

In [124]:
# let's view the result: Cabin and Sex are now numeric, Embarked is unchanged

X_train.head(10)

Unnamed: 0,Cabin,Sex,Embarked
64,-0.349966,-1.030321,C
709,-0.349966,-1.030321,C
52,1.357959,1.5907,C
387,-0.349966,1.5907,S
124,1.357959,-1.030321,S
579,-0.349966,-1.030321,S
550,0.742001,-1.030321,C
118,1.463319,-1.030321,C
12,-0.349966,-1.030321,S
157,-0.349966,-1.030321,S


### Note
1. If the `variables` argument is left as `None`, the encoder will automatically identify all categorical variables.

2. The encoder will not encode numerical variables. So if some of your numerical variables are in fact categories, you will need to re-cast them as object before using the encoder.

3. If there is a label in the test set that was not present in the train set, the encoder will throw an error to alert you of this behaviour.

4. If any of the terms in the weight of evidence calculation is 0, the log of 0 is not defined, so this transformer will raise an error.
