In [58]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [59]:
df = pd.read_csv('../datasets/bank-full.csv', sep=';')

In [60]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Features

In [61]:
no_current_features = [
    'default', 'loan'
]

df = df.drop(columns=no_current_features)

df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


Data preparation

In [62]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

What is the most frequent observation (mode) for the column education?

In [63]:
df.education.mode()

0    secondary
Name: education, dtype: object

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [64]:
numerical_features = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'
]

categorical_features = [
    'job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'y'
]

In [65]:
for i in numerical_features:
    print(i)
    print(df[numerical_features].corrwith(df[i]).abs())
    print('---')

age
age         1.000000
balance     0.097783
day         0.009120
duration    0.004648
campaign    0.004760
pdays       0.023758
previous    0.001288
dtype: float64
---
balance
age         0.097783
balance     1.000000
day         0.004503
duration    0.021560
campaign    0.014578
pdays       0.003435
previous    0.016674
dtype: float64
---
day
age         0.009120
balance     0.004503
day         1.000000
duration    0.030206
campaign    0.162490
pdays       0.093044
previous    0.051710
dtype: float64
---
duration
age         0.004648
balance     0.021560
day         0.030206
duration    1.000000
campaign    0.084570
pdays       0.001565
previous    0.001203
dtype: float64
---
campaign
age         0.004760
balance     0.014578
day         0.162490
duration    0.084570
campaign    1.000000
pdays       0.088628
previous    0.032855
dtype: float64
---
pdays
age         0.023758
balance     0.003435
day         0.093044
duration    0.001565
campaign    0.088628
pdays       1.000000
prev

pdays and previous

Target encoding
$$\begin{align}
\bullet\text{Now we want to encode the y variable.} \\
\bullet\text{Let's replace the values yes/no with 1/0.} \\
\end{align}$$

In [66]:
df.y = (df.y == 'yes').astype(int)
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


Split the data

In [67]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [68]:
# dropping the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# getting y
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only

In [69]:
from sklearn.metrics import mutual_info_score

def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

# appling function to each categorical series
mis = df_full_train[categorical_features].apply(mutual_info_y_score).round(2)
mis.sort_values(ascending=False)

y            0.36
poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
education    0.00
marital      0.00
dtype: float64

poutcome

one-hot encoding for categorical variables

In [70]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.fit_transform(val_dict)

In [71]:
list(X_train[:3][0])

[np.float64(32.0),
 np.float64(1100.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(11.0),
 np.float64(67.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(-1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0)]

Training logistic regression

In [72]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

In [73]:
model.intercept_[0]

np.float64(-0.9051666407258808)

In [74]:
model.predict_proba(X_val)

array([[0.98747767, 0.01252233],
       [0.98963457, 0.01036543],
       [0.85267327, 0.14732673],
       ...,
       [0.94382865, 0.05617135],
       [0.99059147, 0.00940853],
       [0.71082323, 0.28917677]])

In [75]:
y_pred = model.predict_proba(X_val)[:, 1]

y_decision = (y_pred >= 0.5)

df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_decision.astype(int)
df_pred['actual'] = y_val

df_pred['correct'] = df_pred.prediction == df_pred.actual

df_pred.head()

Unnamed: 0,probability,prediction,actual,correct
0,0.012522,0,0,True
1,0.010365,0,0,True
2,0.147327,0,1,False
3,0.204554,0,0,True
4,0.434557,0,1,False


In [76]:
df_pred.correct.mean().round(4)

np.float64(0.9016)

In [77]:
df_pred.correct.sum()

np.int64(8152)

In [78]:
df_pred[
    df_pred.correct == False
].count()

probability    890
prediction     890
actual         890
correct        890
dtype: int64

In [79]:
df_pred[
    df_pred.correct == True
].count()

probability    8152
prediction     8152
actual         8152
correct        8152
dtype: int64

0.9, accuracy score

Feature elimination

In [48]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

features = numerical_features + [i for i in categorical_features if i != 'y']

for i in features:
    
    less_features = [j for j in features if j != i]
    
    dv = DictVectorizer(sparse=False)
    
    train_dict = df_train[less_features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val[less_features].to_dict(orient='records')
    X_val = dv.fit_transform(val_dict)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    
    y_decision = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = y_decision.astype(int)
    df_pred['actual'] = y_val
    
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    
    print('Feature:'+ i)
    print(+ df_pred.correct.mean().round(4))


Feature:age
0.9012
Feature:balance
0.9013
Feature:day
0.901
Feature:duration
0.8886
Feature:campaign
0.9009
Feature:pdays
0.901
Feature:previous
0.9012
Feature:job
0.9008
Feature:marital
0.9001
Feature:education
0.901
Feature:housing
0.9015
Feature:contact
0.9006
Feature:month
0.8998
Feature:poutcome
0.8933


Feature:housing has the smallest difference

In [106]:
def prepare_X(df, dv):
    dicts = df.to_dict(orient='records')
    return dv.transform(dicts)

def train_linear_regression_reg(X, y, r=0.001):
    
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T@X
    XTX += r * np.eye(XTX.shape[0])
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = (XTX_inv@X.T).dot(y)

    # returning tuple, bias term and weights
    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [113]:
# use the X_train before feature elimination step
c = [0, 0.01, 0.1, 1, 10]

# create a DictVectorizer
dv = DictVectorizer(sparse=False)

# fit the DictVectorizer on training data
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

for i in c:
    X_test = prepare_X(df_test, dv)
    
    w0, w = train_linear_regression_reg(X_train, y_train, i)

    y_pred = w0 + X_test.dot(w)
    
    score = rmse(y_test, y_pred)

    print(i)
    print(score.round(6))
    

0
0.904042
0.01
0.270888
0.1
0.270888
1
0.270889
10
0.270906
