In [63]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [64]:
df = pd.read_csv('../datasets/bank-full.csv', sep=';')

In [65]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Features

In [66]:
no_current_features = [
    'default', 'loan'
]

df = df.drop(columns=no_current_features)

df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


Data preparation

In [67]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

What is the most frequent observation (mode) for the column education?

In [53]:
df.education.mode()

0    secondary
Name: education, dtype: object

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [28]:
numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'
]

categorical_features = [
    'job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'y'
]

In [33]:
for i in numerical_features:
    print(i)
    print(df[numerical_features].corrwith(df[i]).abs())
    print('---')

age
age         1.000000
balance     0.097783
day         0.009120
duration    0.004648
campaign    0.004760
pdays       0.023758
previous    0.001288
dtype: float64
---
balance
age         0.097783
balance     1.000000
day         0.004503
duration    0.021560
campaign    0.014578
pdays       0.003435
previous    0.016674
dtype: float64
---
day
age         0.009120
balance     0.004503
day         1.000000
duration    0.030206
campaign    0.162490
pdays       0.093044
previous    0.051710
dtype: float64
---
duration
age         0.004648
balance     0.021560
day         0.030206
duration    1.000000
campaign    0.084570
pdays       0.001565
previous    0.001203
dtype: float64
---
campaign
age         0.004760
balance     0.014578
day         0.162490
duration    0.084570
campaign    1.000000
pdays       0.088628
previous    0.032855
dtype: float64
---
pdays
age         0.023758
balance     0.003435
day         0.093044
duration    0.001565
campaign    0.088628
pdays       1.000000
prev

pdays and previous

Target encoding
$$\begin{align}
\bullet\text{Now we want to encode the y variable.} \\
\bullet\text{Let's replace the values yes/no with 1/0.} \\
\end{align}$$

In [68]:
df.y = (df.y == 'yes').astype(int)
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


Split the data

In [84]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [85]:
# dropping the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# getting y
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only

In [86]:
from sklearn.metrics import mutual_info_score

def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

# appling function to each categorical series
mis = df_full_train[categorical_features].apply(mutual_info_y_score).round(2)
mis.sort_values(ascending=False)

y            0.36
poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
education    0.00
marital      0.00
dtype: float64

poutcome