In [336]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [337]:
df = pd.read_csv('bank-full.csv',sep=';')

In [338]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [339]:
df.shape

(45211, 17)

In [340]:
columns = ['age','job','marital','education','balance','housing',
             'contact','day','month','duration','campaign','pdays',
             'previous','poutcome','y']
df = df[columns]

In [341]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [342]:
df.shape

(45211, 15)

In [343]:
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1

In [344]:
df['education'].mode()

0    secondary
Name: education, dtype: object

### Question 2

### Correlation

In [345]:
numerical = df.copy().select_dtypes(include = 'number')

In [346]:
numerical.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [347]:
numerical.corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [348]:
numerical['age'].corr(numerical['balance'])

0.09778273937134742

In [349]:
numerical['day'].corr(numerical['campaign'])

0.1624902163261929

In [350]:
numerical['day'].corr(numerical['pdays'])

-0.09304407377294044

In [351]:
numerical['pdays'].corr(numerical['previous'])

0.4548196354805016

### Target encoding

In [352]:
df = df.copy()
df['y'] = df['y'].map({'yes':'1','no':'0'})

In [353]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


### split the data

In [354]:
from sklearn.model_selection import train_test_split

In [355]:
seed = 42

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [356]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [357]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [358]:
del df_train['y']
del df_val['y']
del df_test['y']

### Question 3

In [359]:
from sklearn.metrics import mutual_info_score

In [360]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

In [361]:
categorical = df.copy().select_dtypes(exclude = 'number').columns
categorical

Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome', 'y'],
      dtype='object')

In [362]:
categorical_list = ['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome', 'y']

In [363]:
mi = df_full_train[categorical_list].apply(mutual_info_y_score).round(2)
mi.sort_values(ascending=False)

y            0.36
poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

### Question 4

In [364]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

In [365]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [366]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [367]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [368]:
y_pred = model.predict(X_val)

In [369]:
accuracy = np.round(accuracy_score(y_val, y_pred), 2)
print(f'Accuracy = {accuracy}')

Accuracy = 0.9


### Question 5

In [370]:
features = df_train.columns.to_list()
features

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [371]:

# Apply the feature elimination technique.
original_score = accuracy
scores = pd.DataFrame(columns=['eliminated_feature', 'accuracy', 'difference'])
for feature in features:
    subset = features.copy()
    subset.remove(feature)
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    
    scores.loc[len(scores)] = [feature, score, original_score - score]

In [372]:
scores

Unnamed: 0,eliminated_feature,accuracy,difference
0,age,0.901017,-0.001017
1,job,0.900907,-0.000907
2,marital,0.901017,-0.001017
3,education,0.901128,-0.001128
4,balance,0.900686,-0.000686
5,housing,0.899801,0.000199
6,contact,0.900354,-0.000354
7,day,0.901349,-0.001349
8,month,0.899801,0.000199
9,duration,0.889737,0.010263


In [373]:
scores[scores.difference == scores.difference.min()]

Unnamed: 0,eliminated_feature,accuracy,difference
7,day,0.901349,-0.001349


### Question 6

In [375]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)


In [376]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [378]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [379]:
del df_train['y']
del df_val['y']
del df_test['y']

In [380]:
y_train.shape, y_val.shape

((27126,), (9042,))

In [381]:
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [382]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [383]:
scores = {}
for alpha in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=alpha, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = accuracy_score(y_val, y_pred)
    scores[alpha] = round(score, 3)
    print(f'alpha = {alpha}:\t RMSE = {score}')

alpha = 0.01:	 RMSE = 0.8978102189781022
alpha = 0.1:	 RMSE = 0.9013492590134926
alpha = 1:	 RMSE = 0.9006856890068569
alpha = 10:	 RMSE = 0.9006856890068569
alpha = 100:	 RMSE = 0.9009068790090687


In [384]:
print(f'The smallest `alpha` is {min(scores, key=scores.get)}.')

The smallest `alpha` is 0.01.
