In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
from xverse.transformer import MonotonicBinning, WOE

In [3]:
df = pd.read_csv('df_EDA.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,G,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,G,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,G,S


In [4]:
df = df.drop(['Name','Ticket'],1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  890 non-null    int64  
 1   Survived     890 non-null    int64  
 2   Pclass       890 non-null    int64  
 3   Sex          890 non-null    object 
 4   Age          890 non-null    float64
 5   SibSp        890 non-null    int64  
 6   Parch        890 non-null    int64  
 7   Fare         890 non-null    float64
 8   Cabin        890 non-null    object 
 9   Embarked     890 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


In [6]:
y = df['Survived']
X = df.drop(['Survived', 'PassengerId'], 1)

In [7]:
print("Survived:", sum(y))
print("Total:", len(y))

Survived: 342
Total: 890


In [53]:
clf = MonotonicBinning()
clf.fit(X,y)

In [54]:
print(clf.bins)

{'Pclass': array([1., 2., 3.]), 'Age': array([ 0.42      , 23.81818182, 33.        , 80.        ]), 'SibSp': array([0., 1., 8.]), 'Parch': array([0., 1., 6.]), 'Fare': array([  0.    ,   8.6625,  26.    , 512.3292])}


In [25]:
df['Age'].min()

0.42

In [27]:
custom_bins = {
    'Age': range(int(df['Age'].min()), int(df['Age'].max()), 10)
}

WOE

In [28]:
clf = WOE(mono_custom_binning=custom_bins)
clf.fit(X,y)

In [29]:
clf.woe_bins

{'Pclass': {1: 1.0146715831544408,
  2: 0.3626616900366113,
  3: -0.6683058112762066},
 'Sex': {'female': 1.5280538487786721, 'male': -0.9835168255543433},
 'Age': {Interval(-0.001, 10.0, closed='right'): 0.8509541715903142,
  Interval(10.0, 20.0, closed='right'): 0.47146454988541026,
  Interval(20.0, 30.0, closed='right'): -0.4880873046790309,
  Interval(30.0, 40.0, closed='right'): 0.3577056645287524,
  Interval(40.0, 50.0, closed='right'): -0.25153559382421614,
  Interval(50.0, 60.0, closed='right'): 0.0858020690734256,
  Interval(60.0, 70.0, closed='right'): -0.7071904464562357,
  'NA': -0.9148298112344803},
 'SibSp': {Interval(-0.001, 1.0, closed='right'): 0.04348057824189633,
  Interval(1.0, 8.0, closed='right'): -0.5217872231248731},
 'Parch': {Interval(-0.001, 1.0, closed='right'): -0.04003198970581386,
  Interval(1.0, 6.0, closed='right'): 0.3238285510793457},
 'Fare': {Interval(-0.001, 8.662, closed='right'): -0.9270499225692556,
  Interval(8.662, 26.0, closed='right'): 0.077

In [30]:
clf.mono_custom_binning

{'Age': range(0, 80, 10),
 'Parch': array([0., 1., 6.]),
 'Fare': array([  0.    ,   8.6625,  26.    , 512.3292]),
 'Pclass': array([1., 2., 3.]),
 'SibSp': array([0., 1., 8.])}

In [31]:
df_woe = clf.woe_df
df_woe

Unnamed: 0,Variable_Name,Category,Count,Event,Non_Event,Event_Rate,Non_Event_Rate,Event_Distribution,Non_Event_Distribution,WOE,Information_Value
0,Pclass,1,215,136,79,0.632558,0.367442,0.397661,0.144161,1.014672,0.50641
1,Pclass,2,184,87,97,0.472826,0.527174,0.254386,0.177007,0.362662,0.50641
2,Pclass,3,491,119,372,0.242363,0.757637,0.347953,0.678832,-0.668306,0.50641
3,Sex,female,314,233,81,0.742038,0.257962,0.681287,0.14781,1.528054,1.339864
4,Sex,male,576,109,467,0.189236,0.810764,0.318713,0.85219,-0.983517,1.339864
5,Age,"(-0.001, 10.0]",64,38,26,0.59375,0.40625,0.111111,0.047445,0.850954,0.222182
6,Age,"(10.0, 20.0]",142,71,71,0.5,0.5,0.207602,0.129562,0.471465,0.222182
7,Age,"(20.0, 30.0]",343,95,248,0.276968,0.723032,0.277778,0.452555,-0.488087,0.222182
8,Age,"(30.0, 40.0]",176,83,93,0.471591,0.528409,0.24269,0.169708,0.357706,0.222182
9,Age,"(40.0, 50.0]",101,33,68,0.326733,0.673267,0.096491,0.124088,-0.251536,0.222182


In [60]:
# # Cabin IV > 0.5, so checking the feature
# sns.countplot(x=df['Cabin'], hue=y)
# plt.show()

In [61]:
df_woe_imp = df_woe[df_woe['Information_Value'].between(0.1,0.6)]
df_woe_imp

Unnamed: 0,Variable_Name,Category,Count,Event,Non_Event,Event_Rate,Non_Event_Rate,Event_Distribution,Non_Event_Distribution,WOE,Information_Value
0,Pclass,1,215,136,79,0.632558,0.367442,0.397661,0.144161,1.014672,0.50641
1,Pclass,2,184,87,97,0.472826,0.527174,0.254386,0.177007,0.362662,0.50641
2,Pclass,3,491,119,372,0.242363,0.757637,0.347953,0.678832,-0.668306,0.50641
12,Fare,"(-0.001, 8.662]",308,61,247,0.198052,0.801948,0.178363,0.45073,-0.92705,0.431747
13,Fare,"(8.662, 26.0]",288,116,172,0.402778,0.597222,0.339181,0.313869,0.07756,0.431747
14,Fare,"(26.0, 512.329]",294,165,129,0.561224,0.438776,0.482456,0.235401,0.717598,0.431747
22,Embarked,C,168,93,75,0.553571,0.446429,0.27193,0.136861,0.686576,0.119348
23,Embarked,Q,77,30,47,0.38961,0.61039,0.087719,0.085766,0.022514,0.119348
24,Embarked,S,645,219,426,0.339535,0.660465,0.640351,0.777372,-0.193903,0.119348


In [62]:
X2 = clf.transform(X[df_woe['Variable_Name'].unique()])
X2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,-0.668306,-0.983517,0.324534,0.043481,-0.040032,-0.927050,-0.699805,-0.193903
1,1.014672,1.528054,0.054371,0.043481,-0.040032,0.717598,0.653786,0.686576
2,-0.668306,1.528054,-0.405875,0.043481,-0.040032,-0.927050,-0.699805,-0.193903
3,1.014672,1.528054,0.054371,0.043481,-0.040032,0.717598,0.653786,-0.193903
4,-0.668306,-0.983517,0.054371,0.043481,-0.040032,-0.927050,-0.699805,-0.193903
...,...,...,...,...,...,...,...,...
885,0.362662,-0.983517,-0.405875,0.043481,-0.040032,0.077560,0.301566,-0.193903
886,1.014672,1.528054,0.324534,0.043481,-0.040032,0.717598,1.541906,-0.193903
887,-0.668306,1.528054,0.324534,0.043481,0.323829,0.077560,-0.699805,-0.193903
888,1.014672,-0.983517,-0.405875,0.043481,-0.040032,0.717598,0.653786,0.686576


In [63]:
woe_data = pd.concat([X2, y], 1)
woe_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,-0.668306,-0.983517,0.324534,0.043481,-0.040032,-0.927050,-0.699805,-0.193903,0
1,1.014672,1.528054,0.054371,0.043481,-0.040032,0.717598,0.653786,0.686576,1
2,-0.668306,1.528054,-0.405875,0.043481,-0.040032,-0.927050,-0.699805,-0.193903,1
3,1.014672,1.528054,0.054371,0.043481,-0.040032,0.717598,0.653786,-0.193903,1
4,-0.668306,-0.983517,0.054371,0.043481,-0.040032,-0.927050,-0.699805,-0.193903,0
...,...,...,...,...,...,...,...,...,...
885,0.362662,-0.983517,-0.405875,0.043481,-0.040032,0.077560,0.301566,-0.193903,0
886,1.014672,1.528054,0.324534,0.043481,-0.040032,0.717598,1.541906,-0.193903,1
887,-0.668306,1.528054,0.324534,0.043481,0.323829,0.077560,-0.699805,-0.193903,0
888,1.014672,-0.983517,-0.405875,0.043481,-0.040032,0.717598,0.653786,0.686576,1


In [64]:
woe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    890 non-null    float64
 1   Sex       890 non-null    float64
 2   Age       890 non-null    float64
 3   SibSp     890 non-null    float64
 4   Parch     890 non-null    float64
 5   Fare      890 non-null    float64
 6   Cabin     890 non-null    float64
 7   Embarked  890 non-null    float64
 8   Survived  890 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 62.7 KB


In [65]:
woe_data.to_csv('df_WOE.csv', index=False)

In [66]:
test_df = pd.read_csv('df_EDA_Test.csv')

In [67]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,E,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,E,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,F,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,E,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,E,S


In [68]:
#col = ['Sex', 'Age', 'SibSp', 'Cabin', 'Embarked']

In [69]:
Test_WOE = clf.transform(test_df)
Test_WOE.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,-0.668306,"Kelly, Mr. James",-0.983517,0.054371,0.043481,-0.040032,330911,-0.92705,1.610899,0.022514
1,893,-0.668306,"Wilkes, Mrs. James (Ellen Needs)",1.528054,0.054371,0.043481,-0.040032,363272,-0.92705,1.610899,-0.193903
2,894,0.362662,"Myles, Mr. Thomas Francis",-0.983517,0.054371,0.043481,-0.040032,240276,0.07756,0.301566,0.022514
3,895,-0.668306,"Wirz, Mr. Albert",-0.983517,-0.405875,0.043481,-0.040032,315154,-0.92705,1.610899,-0.193903
4,896,-0.668306,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1.528054,0.324534,0.043481,-0.040032,3101298,0.07756,1.610899,-0.193903


In [70]:
Test_WOE.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,-0.668306,"Kelly, Mr. James",-0.983517,0.054371,0.043481,-0.040032,330911,-0.92705,1.610899,0.022514
1,893,-0.668306,"Wilkes, Mrs. James (Ellen Needs)",1.528054,0.054371,0.043481,-0.040032,363272,-0.92705,1.610899,-0.193903
2,894,0.362662,"Myles, Mr. Thomas Francis",-0.983517,0.054371,0.043481,-0.040032,240276,0.07756,0.301566,0.022514
3,895,-0.668306,"Wirz, Mr. Albert",-0.983517,-0.405875,0.043481,-0.040032,315154,-0.92705,1.610899,-0.193903
4,896,-0.668306,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1.528054,0.324534,0.043481,-0.040032,3101298,0.07756,1.610899,-0.193903


In [71]:
Test_WOE.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [80]:
Test_WOE.loc[Test_WOE['Age']=='NA', 'Age'] = 0.324534

In [81]:
Test_WOE[Test_WOE['Age']=='NA']

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [82]:
Test_WOE.to_csv('Test_WOE.csv', index=False)