In [2]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display

In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [4]:
import pickle

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [6]:
# Remove pandas' display limits so every column and every row of a frame is rendered.
# NOTE(review): with max_rows=None a bare `df` prints the whole table; inspect with
# .head() / .sample() to keep the notebook output small.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [8]:
# Load the dataset. The file is parsed with ';' as the field separator and is looked up
# relative to the current working directory, so it must sit next to the notebook
# (or the kernel must be started from the data directory).
df=pd.read_csv('bank-additional-full.csv',sep=';')

FileNotFoundError: [Errno 2] No such file or directory: 'bank-additional-full.csv'

In [None]:
df.head()

In [1]:
df.info()

NameError: name 'df' is not defined

In [32]:
# Collect, in column order, the name of every column whose dtype is `object`.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [33]:
categorical_columns

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome',
 'y']

In [34]:
# Categorical feature names: every object-dtype column except the target `y`.
# Derived from `categorical_columns` rather than a hand-copied list so the two
# cannot silently diverge if the input columns change.
categorical = [column for column in categorical_columns if column != 'y']

In [35]:
# Encode the target as integers: 1 where y == 'yes', 0 otherwise.
# The dtype guard makes the cell idempotent: once `y` is numeric, comparing it with
# 'yes' a second time would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

In [36]:
# Hold out 20% of the rows as the final test set; random_state=1 fixes the shuffle
# so the same rows land in each part on every run.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [39]:
# Feature groups: numeric columns are passed through as-is by the vectorizer,
# string-valued columns are expanded into one indicator per observed value.
numerical = [
    'age',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]
categorical = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]

## Logistic Regression

In [40]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on a training frame.

    Parameters
    ----------
    df_train : DataFrame
        Must contain the columns named in the notebook-level ``categorical``
        and ``numerical`` lists; only those columns are used.
    y_train : array-like
        Target values, one per row of ``df_train``.
    C : float, default 1.0
        Forwarded unchanged to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(vectorizer, classifier)`` - the fitted ``DictVectorizer`` and the
        fitted ``LogisticRegression``.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    classifier = LogisticRegression(C=C, max_iter=1000)

    # The vectorizer learns its feature layout here, on the training rows only.
    classifier.fit(vectorizer.fit_transform(records), y_train)

    return vectorizer, classifier

In [15]:
def predict(df, dv, model):
    """Score every row of ``df`` with an already fitted vectorizer/model pair.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns named in the notebook-level ``categorical``
        and ``numerical`` lists; only those columns are used.
    dv : fitted vectorizer exposing ``transform``.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    Column 1 of ``model.predict_proba`` (one value per row of ``df``).
    """
    records = df[categorical + numerical].to_dict(orient='records')
    features = dv.transform(records)
    return model.predict_proba(features)[:, 1]

In [16]:
# Settings shared by the cross-validation and final-training cells.
C = 1.0  # value passed to train(..., C=C) and embedded in the saved model's file name
n_splits = 5  # number of folds used by KFold

In [41]:
# K-fold cross-validation of the logistic-regression pipeline on the training portion.
# One validation AUC is collected per fold.
# NOTE: the loop variables (df_train, df_val, y_train, y_val) stay defined after the
# loop with the values of the LAST fold, and later cells reuse them.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train.y.values, df_val.y.values

    # Fit on the training fold only, then score the held-out fold.
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)


In [43]:
# Report the cross-validation AUC as "mean +- standard deviation" across the folds.
print(f'{np.mean(scores):.3f} +- {np.std(scores):.3f}')

0.932 +- 0.003


In [46]:
# Retrain on the whole training portion and score once on the held-out test set.
# `C=C` (instead of a literal 1.0) keeps the fitted model in sync with the value of
# `C` that is embedded in the saved model's file name.
dv, model = train(df_full_train, df_full_train.y.values, C=C)
y_pred = predict(df_test, dv, model)

y_test = df_test.y.values
auc = roc_auc_score(y_test, y_pred)
auc

0.9316332090922189

In [50]:
# File name for the pickled model; the current value of C is embedded in it.
output_file=f'model_C={C}.bin'

In [51]:
# Persist the vectorizer together with the model - both are needed to score new rows.
# The context manager guarantees the file is closed even if pickling raises,
# which the manual open()/close() pair did not.
with open(output_file, 'wb') as f_out:  # write, binary
    pickle.dump((dv, model), f_out)

## Load the model

In [52]:
# Restore the (vectorizer, model) tuple from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising -
# only load files that come from a trusted source.
with open(output_file,'rb') as f_in:
    dv,model=pickle.load(f_in)
    # the file is closed automatically when the with-block exits

In [53]:
dv,model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [78]:
# Raise the summarisation threshold to the largest possible value so NumPy prints
# arrays in full instead of abbreviating them with "...".
np.set_printoptions(threshold=sys.maxsize)

In [83]:
# Distinct values present in the `marital` column.
set(df.marital.values)

{'divorced', 'married', 'single', 'unknown'}

In [84]:
# A single hand-written record used to try out the loaded model.
# It is a plain dict keyed by column name, the same record format that
# to_dict(orient='records') produces for the training rows.
customer={'age': 27,
 'job': 'student',
 'marital': 'single',
 'education': 'university.degree',
 'default': 'yes',
 'housing': 'no',
 'loan': 'no',
 'contact': 'cellular',
 'month': 'may',
 'day_of_week': 'mon',
 'duration': 55,
 'campaign': 3,
 'pdays': 999,
 'previous': 0,
 'poutcome': 'nonexistent',
 'emp.var.rate': 1.1,
 'cons.price.idx': 93.994,
 'cons.conf.idx': -36.4,
 'euribor3m': 4.857,
 'nr.employed': 5191.0,
 'y': 0}  # not a feature: the vectorizer was fitted without `y`, so transform() ignores this key

In [85]:
# Encode the single record; wrapping it in a list yields a one-row feature matrix.
X=dv.transform([customer])

In [86]:
# [0, 1] selects row 0 (the only record) and column 1 of the predicted probabilities.
model.predict_proba(X)[0,1]

0.006569082116736781

## Decision Tree

In [101]:
df_train.shape

(26360, 21)

In [102]:
df_val.shape

(6590, 21)

In [105]:
# Build the feature matrices for the decision-tree experiments.
# NOTE: df_train / df_val are the frames left over from the last K-fold iteration.
# NOTE: this rebinds `dv`; from here on it no longer matches the pickled logistic model.
dict_train = df_train[categorical + numerical].to_dict(orient='records')
dict_val = df_val[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)  # feature layout is learned on the training rows only
X_val = dv.transform(dict_val)

# Re-derive the labels from the very same frames so that X_* and y_* are guaranteed
# to be row-aligned, instead of relying on y_train / y_val leaked from an earlier loop.
y_train = df_train.y.values
y_val = df_val.y.values

In [106]:
X_train.shape

(26360, 63)

In [107]:
X_val.shape

(6590, 63)

In [108]:
# Baseline: a tree with no depth or leaf-size constraint.
# random_state is pinned because the tree permutes candidate features at every split;
# without a seed, repeated fits can produce different trees and different scores.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [111]:
# Validation AUC of the current tree, using column 1 of predict_proba as the score.
y_pred=dt.predict_proba(X_val)[:,1]
roc_auc_score(y_val, y_pred)

0.7234557634557636

In [112]:
# A shallow tree (two levels): compare AUC on the rows it was fitted on with AUC on the
# validation rows to see how far the two diverge.
# random_state is pinned so the reported numbers are reproducible across re-runs.
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
dt.fit(X_train, y_train)

y_pred = dt.predict_proba(X_train)[:, 1]
auc = roc_auc_score(y_train, y_pred)
print('train auc: %.3f' % auc)

y_pred = dt.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
print('val auc: %.3f' % auc)

train auc: 0.856
val auc: 0.864


### Parameter tuning: max_depth and min_samples_leaf

In [114]:
# Sweep max_depth (None = unlimited) and print the validation AUC for each value.
# random_state is pinned so differences between rows come from the depth alone,
# not from the tree's random feature permutation.
for depth in [1, 2, 3, 4, 5, 6, 10, 15, 20, None]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print('%4s -> %.3f' % (depth, auc))

   1 -> 0.703
   2 -> 0.864
   3 -> 0.898
   4 -> 0.921
   5 -> 0.931
   6 -> 0.925
  10 -> 0.863
  15 -> 0.763
  20 -> 0.736
None -> 0.721


In [116]:
# With max_depth fixed at 5, sweep min_samples_leaf and print the validation AUC.
# random_state is pinned so the scores are reproducible and comparable across rows.
for m in [1, 5, 10, 15, 20, 50, 100, 200]:
    dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=m, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print('%s -> %.3f' % (m, auc))

1 -> 0.931
5 -> 0.929
10 -> 0.931
15 -> 0.930
20 -> 0.931
50 -> 0.926
100 -> 0.928
200 -> 0.929


In [118]:
# Grid over max_depth x min_samples_leaf, printing the validation AUC of every pair.
# Loop variables are named after the parameter they feed (`m` is used for
# min_samples_leaf in the neighbouring cells, which made `m` = depth here confusing).
# random_state is pinned so the grid is reproducible.
for depth in [1, 5, 10, 20]:
    print('depth: %s' % depth)

    for min_leaf in [1, 5, 10, 15, 20, 50, 100, 200]:
        dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=min_leaf, random_state=1)
        dt.fit(X_train, y_train)
        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print('%s -> %.3f' % (min_leaf, auc))

    print()

depth: 1
1 -> 0.703
5 -> 0.703
10 -> 0.703
15 -> 0.703
20 -> 0.703
50 -> 0.703
100 -> 0.703
200 -> 0.703

depth: 5
1 -> 0.931
5 -> 0.929
10 -> 0.931
15 -> 0.930
20 -> 0.931
50 -> 0.926
100 -> 0.928
200 -> 0.929

depth: 10
1 -> 0.857
5 -> 0.887
10 -> 0.905
15 -> 0.920
20 -> 0.928
50 -> 0.940
100 -> 0.939
200 -> 0.941

depth: 20
1 -> 0.726
5 -> 0.839
10 -> 0.894
15 -> 0.911
20 -> 0.920
50 -> 0.935
100 -> 0.937
200 -> 0.939



In [119]:
# NOTE(review): this cell repeats the earlier max_depth=5 min_samples_leaf sweep verbatim.
# random_state is pinned so that both runs of the sweep report identical numbers.
for m in [1, 5, 10, 15, 20, 50, 100, 200]:
    dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=m, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print('%s -> %.3f' % (m, auc))

1 -> 0.931
5 -> 0.929
10 -> 0.931
15 -> 0.930
20 -> 0.931
50 -> 0.926
100 -> 0.928
200 -> 0.929


In [124]:
# Tree with the chosen settings (max_depth=5, min_samples_leaf=20) and its validation AUC.
# random_state is pinned so this score is reproducible on re-run.
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=20, random_state=1)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred_dt)

0.9309961884961886