In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv('bank-full.csv', delimiter=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
columns = """age,
job,
marital,
education,
balance,
housing,
contact,
day,
month,
duration,
campaign,
pdays,
previous,
poutcome,
y"""
columns = columns.split(',\n')
columns

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [4]:
df = df[columns]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

#### There are no missing values in columns

In [6]:
df['education'].mode()

0    secondary
Name: education, dtype: object

## Question 1: secondary

In [7]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [8]:
categorical_columns = list( set(df.dtypes[df.dtypes == 'object'].index) - {'y'} ) 
categorical_columns

['education', 'month', 'job', 'contact', 'housing', 'marital', 'poutcome']

In [9]:
numerical_columns = list(set(df.columns) - set(categorical_columns) - {'y'})
numerical_columns

['balance', 'previous', 'age', 'campaign', 'duration', 'pdays', 'day']

In [10]:
df[numerical_columns].corr().round(3)

Unnamed: 0,balance,previous,age,campaign,duration,pdays,day
balance,1.0,0.017,0.098,-0.015,0.022,0.003,0.005
previous,0.017,1.0,0.001,-0.033,0.001,0.455,-0.052
age,0.098,0.001,1.0,0.005,-0.005,-0.024,-0.009
campaign,-0.015,-0.033,0.005,1.0,-0.085,-0.089,0.162
duration,0.022,0.001,-0.005,-0.085,1.0,-0.002,-0.03
pdays,0.003,0.455,-0.024,-0.089,-0.002,1.0,-0.093
day,0.005,-0.052,-0.009,0.162,-0.03,-0.093,1.0


## Question 2: highest correlation is between pdays and previous: 0.455

In [11]:
df['y'].unique()

array(['no', 'yes'], dtype=object)

In [12]:
df['y'] = (df.y == "yes").astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = (df.y == "yes").astype(int)


In [13]:
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [14]:
round(float(df.y.mean()),2)

0.12

### Setting the seed

In [15]:
# np.random.seed(42)

In [16]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42) # only splits data into two parts

In [17]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [18]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [19]:
df_train=df_train.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

In [20]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [21]:
len(y_train), len(y_val), len(y_test)

(27126, 9042, 9043)

In [22]:
del df_train['y']
del df_val['y']
del df_test['y']

### Mutual information

In [23]:
def mutual_info_churn_score(series):
	return mutual_info_score(series, (y_train==1))

In [24]:
mutual_info_score(df_train.marital, y_train)

np.float64(0.0020495925927810216)

In [25]:
mi = df_train[categorical_columns].apply(mutual_info_churn_score).round(2)
mi.sort_values(ascending=False)

month        0.03
poutcome     0.03
job          0.01
contact      0.01
housing      0.01
education    0.00
marital      0.00
dtype: float64

## Question 3: poutcome has the highest mi with y: 0.03

In [26]:
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
train_dicts[:5]

[{'education': 'tertiary',
  'month': 'aug',
  'job': 'technician',
  'contact': 'cellular',
  'housing': 'yes',
  'marital': 'single',
  'poutcome': 'unknown',
  'balance': 1100,
  'previous': 0,
  'age': 32,
  'campaign': 1,
  'duration': 67,
  'pdays': -1,
  'day': 11},
 {'education': 'secondary',
  'month': 'nov',
  'job': 'entrepreneur',
  'contact': 'cellular',
  'housing': 'yes',
  'marital': 'married',
  'poutcome': 'unknown',
  'balance': 0,
  'previous': 0,
  'age': 38,
  'campaign': 1,
  'duration': 258,
  'pdays': -1,
  'day': 17},
 {'education': 'secondary',
  'month': 'may',
  'job': 'blue-collar',
  'contact': 'cellular',
  'housing': 'yes',
  'marital': 'married',
  'poutcome': 'unknown',
  'balance': 3309,
  'previous': 0,
  'age': 49,
  'campaign': 2,
  'duration': 349,
  'pdays': -1,
  'day': 15},
 {'education': 'primary',
  'month': 'aug',
  'job': 'housemaid',
  'contact': 'cellular',
  'housing': 'no',
  'marital': 'married',
  'poutcome': 'unknown',
  'balance': 

In [27]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [28]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [29]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [30]:
model.fit(X_train, y_train)

In [31]:
val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')

X_val = dv.transform(val_dicts)
y_pred = model.predict(X_val)

In [32]:
acc = (y_pred == y_val).mean()
acc

np.float64(0.9009068790090687)

In [33]:
acc.round(2)

np.float64(0.9)

## Question 4: Accuracy on validation set: 0.9

In [34]:
tested_features = ["age", "balance", "marital", "previous"]

In [35]:
def model_performance_on_specific_columns(columns_list):
    train_dicts = df_train[columns_list].to_dict(orient='records')
    val_dicts = df_val[columns_list].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)    
    return (y_pred == y_val).mean()

In [36]:
og_model_acc = model_performance_on_specific_columns(categorical_columns + numerical_columns)
print(f"Original model accuracy: {og_model_acc}")
for column_name in tested_features:
    columns_list = list(set(categorical_columns + numerical_columns) - {column_name})
    small_model_acc = model_performance_on_specific_columns(columns_list)
    print(f"{column_name}: acc: {small_model_acc}, diff: {og_model_acc - small_model_acc}")

Original model accuracy: 0.9009068790090687
age: acc: 0.9011280690112807, diff: -0.0002211900022119906
balance: acc: 0.9014598540145985, diff: -0.0005529750055297544
marital: acc: 0.9006856890068569, diff: 0.00022119000221187957
previous: acc: 0.9012386640123866, diff: -0.00033178500331787486


## Question 5: marital has the lowest impact on model performance: 0.00022119000221187957

In [37]:
def reg_lg(reg_par):
    train_dicts = df_train.to_dict(orient='records')
    val_dicts = df_val.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='liblinear', C=reg_par, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return (y_pred == y_val).mean().round(3)

In [38]:
C = [0.01, 0.1, 1, 10, 100]

In [39]:
res = []
for c in C:
    res.append({'reg_par': c, 'accuracy': reg_lg(c)})

In [40]:
df = pd.DataFrame(res).set_index('reg_par')
df = df.sort_values(by='accuracy', ascending=False)
df

Unnamed: 0_level_0,accuracy
reg_par,Unnamed: 1_level_1
0.1,0.901
1.0,0.901
10.0,0.901
100.0,0.9
0.01,0.898


## Question 6: highest accuracy is reached with regularization parameter c = 0.1