In [150]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [122]:
df = pd.read_csv('bank-full.csv', delimiter=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [123]:
columns = """age,
job,
marital,
education,
balance,
housing,
contact,
day,
month,
duration,
campaign,
pdays,
previous,
poutcome,
y"""
columns = columns.split(',\n')
columns

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [124]:
df = df[columns]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [125]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

#### There are no missing values in columns

In [126]:
df['education'].mode()

0    secondary
Name: education, dtype: object

## Question 1: secondary

In [127]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [128]:
categorical_columns = list( set(df.dtypes[df.dtypes == 'object'].index) - {'y'} ) 
categorical_columns

['marital', 'month', 'education', 'job', 'poutcome', 'housing', 'contact']

In [129]:
numerical_columns = list(set(df.columns) - set(categorical_columns) - {'y'})
numerical_columns

['campaign', 'duration', 'pdays', 'balance', 'previous', 'day', 'age']

In [130]:
df[numerical_columns].corr().round(3)

Unnamed: 0,campaign,duration,pdays,balance,previous,day,age
campaign,1.0,-0.085,-0.089,-0.015,-0.033,0.162,0.005
duration,-0.085,1.0,-0.002,0.022,0.001,-0.03,-0.005
pdays,-0.089,-0.002,1.0,0.003,0.455,-0.093,-0.024
balance,-0.015,0.022,0.003,1.0,0.017,0.005,0.098
previous,-0.033,0.001,0.455,0.017,1.0,-0.052,0.001
day,0.162,-0.03,-0.093,0.005,-0.052,1.0,-0.009
age,0.005,-0.005,-0.024,0.098,0.001,-0.009,1.0


## Question 2: highest correlation is between pdays and previous: 0.455

In [131]:
df['y'].unique()

array(['no', 'yes'], dtype=object)

In [132]:
df['y'] = (df.y == "yes").astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = (df.y == "yes").astype(int)


In [133]:
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [134]:
round(float(df.y.mean()),2)

0.12

### Setting the seed

In [135]:
np.random.seed(42)

In [136]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1) # only splits data into two parts

In [137]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [138]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [139]:
df_train=df_train.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

In [140]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [141]:
len(y_train), len(y_val), len(y_test)

(27126, 9042, 9043)

In [142]:
del df_train['y']
del df_val['y']
del df_test['y']

### Mutual information

In [146]:
def mutual_info_churn_score(series):
	return mutual_info_score(series, (y_train==1))

In [147]:
mutual_info_score(df_train.marital, y_train)

np.float64(0.0020424064629195673)

In [149]:
mi = df_train[categorical_columns].apply(mutual_info_churn_score).round(2)
mi.sort_values(ascending=False)

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

## Question 3: poutcome has the highest mi with y: 0.03

In [153]:
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
train_dicts

[{'marital': 'married',
  'month': 'jul',
  'education': 'tertiary',
  'job': 'entrepreneur',
  'poutcome': 'unknown',
  'housing': 'no',
  'contact': 'cellular',
  'campaign': 2,
  'duration': 123,
  'pdays': -1,
  'balance': 40,
  'previous': 0,
  'day': 11,
  'age': 56},
 {'marital': 'married',
  'month': 'apr',
  'education': 'secondary',
  'job': 'technician',
  'poutcome': 'failure',
  'housing': 'yes',
  'contact': 'cellular',
  'campaign': 1,
  'duration': 137,
  'pdays': 272,
  'balance': 4790,
  'previous': 2,
  'day': 20,
  'age': 33},
 {'marital': 'single',
  'month': 'nov',
  'education': 'tertiary',
  'job': 'technician',
  'poutcome': 'unknown',
  'housing': 'yes',
  'contact': 'cellular',
  'campaign': 2,
  'duration': 148,
  'pdays': -1,
  'balance': 392,
  'previous': 0,
  'day': 21,
  'age': 32},
 {'marital': 'divorced',
  'month': 'apr',
  'education': 'secondary',
  'job': 'admin.',
  'poutcome': 'unknown',
  'housing': 'no',
  'contact': 'cellular',
  'campaign': 

In [160]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [161]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [162]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [165]:
model.fit(X_train, y_train)

In [168]:
val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')

X_val = dv.transform(val_dicts)
y_pred = model.predict(X_val)

In [170]:
round((y_pred == y_val).mean(),2)

np.float64(0.9)

## Question 4: Accuracy on validation set: 0.9

In [171]:
tested_features = ["age", "balance", "marital", "previous"]

In [184]:
def model_performance_on_specific_columns(columns_list):
    train_dicts = df_train[columns_list].to_dict(orient='records')
    val_dicts = df_val[columns_list].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)    
    return (y_pred == y_val).mean()

In [185]:
og_model_acc = model_performance_on_specific_columns(categorical_columns + numerical_columns)

for column_name in tested_features:
    columns_list = list(set(categorical_columns + numerical_columns) - {column_name})
    small_model_acc = model_performance_on_specific_columns(columns_list)
    print(f"{column_name}: acc: {small_model_acc}, diff: {og_model_acc - small_model_acc}")

age: acc: 0.8976996239769962, diff: 0.0005529750055297544
balance: acc: 0.8984737889847378, diff: -0.00022119000221187957
marital: acc: 0.898252598982526, diff: 0.0
previous: acc: 0.8979208139792081, diff: 0.00033178500331787486


## Question 5: marital has the lowest impact on model performance: 0.0

In [194]:
LogisticRegression?

[0;31mInit signature:[0m
[0mLogisticRegression[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpenalty[0m[0;34m=[0m[0;34m'l2'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdual[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mC[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfit_intercept[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mintercept_scaling[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msolver[0m[0;34m=[0m[0;34m'lbfgs'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmulti_class[0m[0;34m=[0m[

In [200]:
def reg_lg(reg_par):
    train_dicts = df_train[columns_list].to_dict(orient='records')
    val_dicts = df_val[columns_list].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='liblinear', C=reg_par, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)    
    return (y_pred == y_val).mean().round(4)

In [205]:
C = [0.01, 0.1, 1, 10, 100]

In [206]:
res = []
for c in C:
    res.append({'reg_par': c, 'accuracy': reg_lg(c)})

In [209]:
df = pd.DataFrame(res).set_index('reg_par')
df = df.sort_values(by='accuracy', ascending=False)
df

Unnamed: 0_level_0,accuracy
reg_par,Unnamed: 1_level_1
10.0,0.8988
1.0,0.8983
100.0,0.8983
0.1,0.8974
0.01,0.8946


## Question 6: highest accuracy is reached with regularization parameter c = 10