In [104]:

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [105]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [106]:

# Load the semicolon-delimited bank marketing data set.
df = pd.read_csv('bank+marketing/bank/bank-full.csv', sep=';')

# Encode the target as integers with an explicit mapping and cast. Relying on
# `replace` to turn the object column into ints depends on pandas' implicit
# downcasting, which is deprecated (FutureWarning since pandas 2.2). Any label
# other than 'yes'/'no' becomes NaN here and makes the cast fail loudly instead
# of slipping through as a string.
df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)

In [107]:
# Preview the first rows of the loaded frame (including the target column `y`).
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [108]:
# Two-stage split with a fixed seed: hold out 20% of the rows for testing,
# then take 25% of the remainder (20% of the total) for validation,
# which leaves 60% for training.
df_full_train, df_test = train_test_split(
    df,
    random_state=1,
    test_size=0.2,
)
df_train, df_val = train_test_split(
    df_full_train,
    random_state=1,
    test_size=0.25,
)

In [109]:
# Drop the row labels inherited from the original frame so that every
# partition is indexed from 0 again.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [110]:
# Separate the target from the features. `pop` returns the column and removes
# it from the frame in one step, so the label is no longer in the feature
# frames afterwards.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

In [111]:
# Numerical columns evaluated one at a time in Question 1.
numerical = [
    'balance',
    'day',
    'duration',
    'previous',
]

### Question 1: ROC AUC Feature Importance

In [116]:

dv = DictVectorizer(sparse=False)

# Probability cut-off used to turn predicted scores into hard 0/1 decisions.
t = 0.5

# For each numerical column, fit a logistic regression on that single feature
# and report, on the validation split, the accuracy at threshold `t` and the
# ROC AUC. The AUC is threshold-independent, so it is the number to compare
# across features.
for item in numerical:
    train_dict = df_train[[item]].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Reuse the vectorizer fitted on the training rows for the validation rows.
    val_dict = df_val[[item]].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = model.predict_proba(X_val)[:, 1]
    y_decision = (y_pred >= t)

    # roc_curve sweeps every threshold itself, so no hand-computed
    # single-threshold TPR/FPR is needed to obtain the AUC.
    fpr, tpr, thresholds = roc_curve(y_val, y_pred)

    print(item, (y_val == y_decision).mean(), auc(fpr, tpr))
 


balance 0.878345498783455 0.5995068851725284
day 0.8785666887856669 0.5352198426324892
duration 0.8836540588365406 0.7965344730967409
previous 0.8775713337757134 0.607255799705406


In [119]:
# Look up the column dtypes of `df`.
# NOTE: this is not the last expression in the cell, so its result is
# discarded and nothing is displayed.
df.dtypes

# Feature columns for the full model, grouped by how they are encoded:
# numeric columns are passed through as-is, string columns are one-hot encoded.
numerical = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
categorical = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]
 

### Question 2: Training the Model

In [121]:
# Train a logistic regression on every categorical and numerical column;
# DictVectorizer one-hot encodes the string-valued columns.
features = categorical + numerical

dv = DictVectorizer(sparse=False)
train_dict = df_train[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

# Encode the validation rows with the vocabulary learned from the training rows.
val_dict = df_val[features].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Score the positive class, threshold at 0.5, and show the validation accuracy.
y_pred = model.predict_proba(X_val)[:, 1]
y_decision = (y_pred >= 0.5)
accuracy = (y_val == y_decision).mean()
round(accuracy, 3)

0.898

0.5995068851725284

0.878345498783455