### Evaluating classification models

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold


from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Encode the "card" column as a binary target: "yes" -> 1, "no" -> 0.

card_values = dict(yes=1, no=0)
data["card"] = data["card"].map(card_values)
data.head(n=10)

In [None]:
# Feature columns used by the model, grouped by type

numerical = [
    "reports",
    "age",
    "income",
    "share",
    "expenditure",
    "dependents",
    "months",
    "majorcards",
    "active",
]
categorical = [
    "owner",
    "selfemp",
]

In [None]:
# Split the data into train / validation / test parts (60% / 20% / 20%):
# 20% is held out for test, then 25% of the remaining 80% becomes validation.

full_train, test = train_test_split(data, test_size=0.2, random_state=1)
train, val = train_test_split(full_train, test_size=0.25, random_state=1)

train, val, test = (part.reset_index(drop=True) for part in (train, val, test))

# pop() returns the target column and removes it from the feature frame
y_train = train.pop('card')
y_val = val.pop('card')
y_test = test.pop('card')

In [None]:
# Score every numerical feature on its own by using its raw values as the prediction

for c in numerical:
    feature = train[c]
    auc = roc_auc_score(y_train, feature)
    # An AUC below 0.5 is re-scored with the feature negated
    if auc < 0.5:
        auc = roc_auc_score(y_train, -feature)
    print('%9s, %.3f' % (c, auc))

In [None]:
# Plot the ROC curve obtained by using the raw `share` feature as the score

plt.figure(figsize=(5, 5))

fpr, tpr, _ = roc_curve(y_train, train.share)
# The feature is plotted un-negated, so the legend label carries no minus sign
plt.plot(fpr, tpr, label='share')

# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')

plt.legend()
plt.show()

### Train the model


In [None]:
# Train the logistic regression model

columns = categorical + numerical

# One dict per row; DictVectorizer turns them into a dense feature matrix
train_dicts = train[columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(train_dicts)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(x_train, y_train)

# Validation rows are only transformed, reusing the vectorizer fitted on train
val_dicts = val[columns].to_dict(orient='records')
x_val = dv.transform(val_dicts)

# Column 1 of predict_proba is the probability of the positive class
y_pred = model.predict_proba(x_val)[:, 1]

In [None]:
# ROC AUC of the predicted probabilities on the validation set
roc_auc_score(y_true=y_val, y_score=y_pred)

In [None]:
# ROC AUC when the model's hard class predictions are used as the score
y_pred_bin = model.predict(x_val)
roc_auc_score(y_true=y_val, y_score=y_pred_bin)

In [None]:
# Inspecting the ROC Curve
# NOTE(review): this import is no longer needed by this cell now that the
# reference line is drawn with plt.plot; kept in case other cells call plot().
from matplotlib.pyplot import plot


plt.figure(figsize=(5, 5))

# Curve for the predicted probabilities
fpr, tpr, _ = roc_curve(y_val, y_pred)
plt.plot(fpr, tpr, label='probability')

# Curve for the hard class predictions
fpr, tpr, _ = roc_curve(y_val, y_pred_bin)
plt.plot(fpr, tpr, label='hard prediction')

# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')

plt.legend()
plt.show()

In [None]:
# Creating confusion matrix
def confusion_matrix(y_val, y_pred):
    """Tabulate confusion-matrix counts for 101 thresholds from 0 to 1.

    A row is predicted positive at threshold ``t`` when its score is
    ``>= t`` and predicted negative otherwise.

    Args:
        y_val: Array-like of true labels; 1 is positive, 0 is negative.
        y_pred: Array-like of predicted scores, aligned with ``y_val``.

    Returns:
        DataFrame with one row per threshold and the columns
        ``threshold``, ``tp``, ``fp``, ``fn``, ``tn``.
    """
    actual = np.asarray(y_val)
    predicted = np.asarray(y_pred)

    # The ground truth does not depend on the threshold, so compute it once
    actual_positive = (actual == 1)
    actual_negative = (actual == 0)

    scores = []

    for t in np.linspace(0, 1, 101):
        predict_positive = (predicted >= t)
        predict_negative = (predicted < t)

        # Correct predictions
        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()

        # Errors: predicted positive but actually negative, and vice versa
        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()

        scores.append((t, tp, fp, fn, tn))

    columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
    df_scores = pd.DataFrame(scores, columns=columns)

    return df_scores

# Build the score table for the validation predictions and preview every 10th row
df_scores = confusion_matrix(y_val, y_pred)
df_scores.iloc[::10]

In [None]:
# Precision = TP / (TP + FP); recall = TP / (TP + FN)
df_scores['p'] = df_scores['tp'] / (df_scores['tp'] + df_scores['fp'])
df_scores['r'] = df_scores['tp'] / (df_scores['tp'] + df_scores['fn'])

In [None]:
# choosing a threshold
# Precision and recall plotted against the decision threshold
# (the column produced by confusion_matrix is named 'threshold')
plt.plot(df_scores.threshold, df_scores.p, label='precision')
plt.plot(df_scores.threshold, df_scores.r, label='recall')

plt.legend()
plt.show()

In [None]:
# Inspecting the F1 score
# F1 = 2 * P * R / (P + R), computed from the precision and recall columns
df_scores['f1'] = 2 * df_scores.p * df_scores.r / (df_scores.p + df_scores.r)

plt.plot(df_scores.threshold, df_scores.f1)
plt.xticks(np.linspace(0, 1, 11))
plt.show()

In [None]:
# Defining train and prediction functions

def train(train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression model on a training frame.

    Args:
        train: DataFrame containing at least the feature columns listed in
            the module-level ``columns``.
        y_train: Target values aligned row-for-row with ``train``.
        C: Passed to ``LogisticRegression`` as its ``C`` parameter.

    Returns:
        Tuple ``(dv, model)``: the fitted vectorizer and the fitted model.
    """
    dicts = train[columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    # Vectorize the rows passed in, not a variable left over from another cell
    x_train = dv.fit_transform(dicts)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(x_train, y_train)

    return dv, model

def predict(df, dv, model):
    """Return positive-class probabilities for every row of ``df``.

    Args:
        df: DataFrame containing at least the feature columns listed in
            the module-level ``columns``.
        dv: Fitted vectorizer, as returned by ``train``.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        Column 1 of ``model.predict_proba``, one value per row of ``df``.
    """
    dicts = df[columns].to_dict(orient='records')

    # transform only: the vectorizer must keep the mapping learned in training
    x = dv.transform(dicts)
    y_pred = model.predict_proba(x)[:, 1]

    return y_pred

In [None]:
# Initializing k-fold cross validation
# For each value of C, run 5-fold cross-validation on full_train and report
# the mean and standard deviation of the validation AUC.

kfold = KFold(n_splits=5, shuffle=True, random_state=1)

for C in [0.01, 0.1, 1, 10]:
    scores = []

    for train_idx, val_idx in kfold.split(full_train):
        # df_* names keep the `train` function from being overwritten by a
        # DataFrame, which would make the train(...) call below fail
        df_train = full_train.iloc[train_idx]
        df_val = full_train.iloc[val_idx]

        y_train = df_train.card
        y_val = df_val.card

        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv, model)

        auc = roc_auc_score(y_val, y_pred)
        scores.append(auc)

    print('C=%4s, %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))
