# Models and Evaluation

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Load the preprocessed dataset; the path is relative to the working directory.
data = pd.read_csv('data_processed/data.csv')
# Bare expression: rendered as the cell output when it is the last line of a notebook cell.
data

In [None]:
# Keep only modelling columns: drop every column whose name ends in '_id'
# as well as the 'code' column.
no_ids = [c for c in data.columns if not (c.endswith('_id') or c == 'code')]

# These three columns are excluded from the modelling frame as well.
data = data[no_ids].drop(columns=['account_frequency', 'gender', 'card_type'])
display(data.head(10))

# Names of the object-dtype columns; add_model() passes them to encode_data().
categorical_columns = data.select_dtypes("object").columns.tolist()

def get_features(df):
    """Return every column of ``df`` except 'status' as a 2-D array.

    ``df`` itself is left untouched; a KeyError is raised when it has no
    'status' column.
    """
    feature_frame = df.drop(columns='status')
    return feature_frame.values
def get_target(df):
    """Return the values of the 'status' column of ``df`` (the prediction target)."""
    target_column = df['status']
    return target_column.values

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
data.info()


# Registry keyed by model name; each entry holds the fitted estimator together
# with the train/test arrays it was fitted and evaluated on.
results = {}

In [None]:
def split_data(features, target, test_size=0.2, random_state=1):
    """Split features and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``; the
    default ``random_state`` is fixed so repeated calls give the same split.

    Args:
        features: Feature matrix.
        target: Target values aligned with ``features``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    partitions = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(partitions)

In [None]:
def normalize_data(X_train, X_test, scaler):
    """Scale the train and test features with ``scaler``.

    The scaler is fitted on the training data only and the learned parameters
    are then applied to the test data. Re-fitting on the test set would scale
    it with its own statistics, leaking test information and putting the two
    sets on different scales.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        scaler: Object exposing ``fit_transform`` and ``transform`` in the
            scikit-learn transformer style.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    X_train = scaler.fit_transform(X_train)
    # transform(), not fit_transform(): reuse the parameters learned from X_train.
    X_test = scaler.transform(X_test)
    return X_train, X_test

def standardize_data(X_train, X_test):
    """Scale both feature sets with a ``StandardScaler``.

    Delegates to ``normalize_data`` and returns its result so callers can
    unpack ``X_train, X_test = standardize_data(X_train, X_test)``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    from sklearn.preprocessing import StandardScaler

    return normalize_data(X_train, X_test, StandardScaler())
def min_max_scaling(X_train, X_test):
    """Scale both feature sets with a ``MinMaxScaler``.

    Delegates to ``normalize_data`` and returns its result so callers can
    unpack ``X_train, X_test = min_max_scaling(X_train, X_test)``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    from sklearn.preprocessing import MinMaxScaler

    return normalize_data(X_train, X_test, MinMaxScaler())

In [None]:
def encode_data(df, columns):
    """Label-encode the listed columns of ``df`` in place.

    Names in ``columns`` that are not present in ``df`` are skipped. One
    ``LabelEncoder`` instance is reused and re-fitted for every column, so no
    per-column encoder is kept afterwards.

    Args:
        df: Frame whose columns are overwritten with their integer codes.
        columns: Iterable of candidate column names.

    Returns:
        The same ``df`` object, after modification.
    """
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    present = (name for name in columns if name in df.keys())
    for name in present:
        df[name] = encoder.fit_transform(df[name])
    return df

In [None]:
def add_model(name, model):
    """Fit ``model`` on a split of the global ``data`` and store it in ``results``.

    For the name 'dtc' the 'age_on_loan_request_disc' column is dropped; for
    every other name 'age_on_loan_request' is dropped instead. The remaining
    frame is label-encoded, split into train/test parts and used to fit the
    model.

    Args:
        name: Key under which the entry is stored in ``results``.
        model: Estimator exposing ``fit(X, y)``.
    """
    unused_age_column = (
        'age_on_loan_request_disc' if name == 'dtc' else 'age_on_loan_request'
    )
    # Work on a copy so the global frame keeps both age columns.
    df = data.copy()
    df.drop(columns=[unused_age_column], inplace=True)

    df = encode_data(df, categorical_columns)
    X_train, X_test, y_train, y_test = split_data(get_features(df), get_target(df))
    # Feature scaling is currently switched off:
    # X_train, X_test = standardize_data(X_train, X_test)
    # X_train, X_test = min_max_scaling(X_train, X_test)

    model.fit(X_train, y_train)

    entry = {'model': model}
    entry['X_train'] = X_train
    entry['X_test'] = X_test
    entry['y_train'] = y_train
    entry['y_test'] = y_test
    results[name] = entry

In [None]:
# Decision tree built with its default constructor arguments, registered as 'dtc'.
from sklearn.tree import DecisionTreeClassifier

add_model('dtc', DecisionTreeClassifier())

In [None]:
# k-nearest-neighbours classifier built with its default constructor arguments, registered as 'knn'.
from sklearn.neighbors import KNeighborsClassifier

add_model('knn', KNeighborsClassifier())

In [None]:
# Multi-layer perceptron built with its default constructor arguments, registered as 'mlp'.
from sklearn.neural_network import MLPClassifier

add_model('mlp', MLPClassifier())

In [None]:
# Gaussian naive Bayes built with its default constructor arguments, registered as 'gnb'.
from sklearn.naive_bayes import GaussianNB

add_model('gnb', GaussianNB())

In [None]:
# Support vector classifier, registered as 'svc'. probability=True is required
# so the model exposes predict_proba(), which the later cells call.
from sklearn.svm import SVC

add_model('svc', SVC(probability=True))

In [None]:
# Random forest built with its default constructor arguments, registered as 'rf'.
from sklearn.ensemble import RandomForestClassifier

add_model('rf', RandomForestClassifier())

## Prediction

In [None]:
def predict(name, isTrain=False):
    """Store the predictions of model ``name`` in its ``results`` entry.

    Predictions are computed on 'X_train' when ``isTrain`` is true and on
    'X_test' otherwise, and saved under the key 'trainpred' / 'testpred'.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    entry[split + 'pred'] = entry['model'].predict(entry['X_' + split])

# Predict for every registered model: all test sets first, then all train sets.
for is_train in (False, True):
    for name in results:
        predict(name, isTrain=is_train)

In [None]:
def predict_proba(name, isTrain=False):
    """Store the class probabilities of model ``name`` in its ``results`` entry.

    Probabilities are computed on 'X_train' when ``isTrain`` is true and on
    'X_test' otherwise, and saved under 'trainpred_prob' / 'testpred_prob'.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    entry[split + 'pred_prob'] = entry['model'].predict_proba(entry['X_' + split])

# Class probabilities for every registered model: all test sets first, then all train sets.
for is_train in (False, True):
    for name in results:
        predict_proba(name, isTrain=is_train)


## Evaluation

In [None]:
def recall(cm, i):
    """Return ``cm[i][i]`` divided by the sum of row ``i`` of ``cm``.

    With scikit-learn's confusion-matrix layout (rows are true labels) this is
    the recall of class ``i``. ``np.inf`` is returned as a sentinel when the
    row sums to zero.
    """
    row_total = sum(cm[i])
    if row_total > 0:
        return cm[i][i] / row_total
    return np.inf

def precision(cm, i):
    """Return ``cm[i][i]`` divided by the sum of column ``i`` of ``cm``.

    With scikit-learn's confusion-matrix layout (columns are predicted labels)
    this is the precision of class ``i``. ``np.inf`` is returned as a sentinel
    when the column sums to zero.
    """
    column_total = sum(np.asarray(cm).T[i])
    if column_total > 0:
        return cm[i][i] / column_total
    return np.inf

def f_measure(cm, i):
    """Return the harmonic mean of precision and recall for class ``i``.

    Both inputs are multiplied by 100 first, so the result is on a 0-100
    scale, unlike ``precision`` and ``recall`` which return 0-1 values.
    ``np.nan`` is returned when either input is the ``np.inf`` sentinel or
    when their sum is not positive.
    """
    p = precision(cm, i) * 100
    r = recall(cm, i) * 100
    if p == np.inf or r == np.inf or not (p + r > 0):
        return np.nan
    return 2 * (p * r) / (p + r)

In [None]:
from matplotlib.figure import Figure
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def conf_matrix(y_test, y_pred, name, prefix):
    """Plot the confusion matrix of ``y_pred`` against ``y_test``.

    The plot uses the fixed display labels 'True' and 'False' and is titled
    ``"<name> <prefix>"``.
    """
    matrix = confusion_matrix(y_test, y_pred)
    display_obj = ConfusionMatrixDisplay(matrix, display_labels=['True', 'False'])
    display_obj.plot()
    title = name + ' ' + prefix
    display_obj.ax_.set_title(title)

In [None]:
def score(name, isTrain=False):
    """Compute, store and print the ``score()`` value of model ``name``.

    The estimator's own ``score`` method is evaluated on the train split when
    ``isTrain`` is true and on the test split otherwise; the value is saved
    under 'trainscore' / 'testscore' in the model's ``results`` entry.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    value = entry['model'].score(entry['X_' + split], entry['y_' + split])
    entry[split + 'score'] = value
    print(split, name.upper()+":\t", value)

# Header, then the score of every registered model: test splits first, train splits second.
print('\tAccuracy')
for is_train in (False, True):
    for name in results:
        score(name, isTrain=is_train)

In [None]:
def evaluate(name, isTrain=False):
    """Print recall, precision and F-measure of model ``name`` for class index 0.

    The confusion matrix is built from the stored labels and the stored
    predictions ('trainpred' / 'testpred'), so ``predict`` must have been run
    for the requested split beforehand. All three values are rounded to two
    decimals.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    cm = confusion_matrix(entry['y_' + split], entry[split + 'pred'])

    recall_0 = round(recall(cm, 0), 2)
    precision_0 = round(precision(cm, 0), 2)
    f_measure_0 = round(f_measure(cm, 0), 2)
    print(
        split, name.upper()+":",
        '\tRecall:', recall_0,
        '\t Precision:', precision_0,
        '\tF_Measure:', f_measure_0,
    )

# Report the metrics of every registered model: test splits first, train splits second.
for is_train in (False, True):
    for name in results:
        evaluate(name, isTrain=is_train)


In [None]:
# Decision threshold applied to the last column of the stored class probabilities.
prob = 0.50

prefixes = ('test', 'train')
# Only the first entry ('test') is plotted.
for prefix in prefixes[:1]:
    for name, result in results.items():
        conf_matrix(
            result['y_' + prefix],
            # Label 1 where the probability exceeds the threshold, -1 otherwise.
            np.where(result[prefix + 'pred_prob'][:, -1] > prob, 1, -1),
            name,
            prefix,
        )

In [None]:
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
import matplotlib.pyplot as plt

def roc_and_auc(name, isTrain=False):
    """Plot the test and train ROC curves of model ``name`` and print both AUCs.

    Both curves are drawn on one shared axis. The first column of the stored
    probabilities is used as the score, with the model's first class as the
    positive label. ``isTrain`` is accepted but not used: both splits are
    always processed.
    """
    _, ax = plt.subplots()
    entry = results[name]
    for split in ('test', 'train'):
        scores = entry[split + 'pred_prob'][:, 0]
        fpr, tpr, _ = roc_curve(
            entry['y_' + split],
            scores,
            pos_label=entry['model'].classes_[0],
        )
        RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax, name=(name.upper()+' '+split))
        print(name.upper(), split + ":\t", auc(fpr, tpr))

# One ROC figure (test + train curves) per registered model.
for name in results:
    roc_and_auc(name)