# Models and Evaluation

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Load the dataset from data_processed/data.csv. The bare `data` expression on
# the last line makes the notebook render the frame as the cell's output.
data = pd.read_csv('data_processed/data.csv')
data

In [None]:
# Keep every column except identifier columns (names ending in '_id') and the
# 'code' column.
no_ids = [
    column
    for column in data.columns
    if not (column.endswith('_id') or column == 'code')
]
data = data[no_ids]

# Columns with dtype "object" are the ones that get label-encoded later on.
categorical_columns = data.select_dtypes("object").columns.tolist()
print(categorical_columns)

def get_features(df):
    """Return every column of *df* except ``'status'`` as a NumPy array.

    *df* itself is left untouched; ``drop`` works on a new frame.
    """
    feature_frame = df.drop(columns='status')
    return feature_frame.values
def get_target(df):
    """Return the ``'status'`` column of *df* as a NumPy array."""
    target_column = df['status']
    return target_column.values

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in display(): that only appends a stray "None" to the output.
data.info()


# Registry of trained models: add_model() stores one entry per model name.
results = {}

In [None]:
def split_data(features, target, test_size=0.2, random_state=1):
    """Split *features* and *target* into train and test parts.

    Args:
        features: Feature matrix.
        target: Target values aligned with *features*.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 1).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(parts)

In [None]:
def normalize_data(X_train, X_test, scaler):
    """Scale both splits with *scaler*, fitted on the training split only.

    Args:
        X_train: Training features; used to fit *scaler*.
        X_test: Test features; transformed with the already-fitted *scaler*.
        scaler: Object exposing ``fit_transform`` and ``transform``
            (scikit-learn transformer interface).

    Returns:
        The tuple ``(X_train, X_test)`` of scaled features.
    """
    X_train = scaler.fit_transform(X_train)
    # Only transform here: re-fitting on the test split would scale it with
    # its own statistics, so train and test would no longer share one scale
    # and test-set information would leak into preprocessing.
    X_test = scaler.transform(X_test)
    return X_train, X_test

def standardize_data(X_train, X_test):
    """Scale both splits with a ``StandardScaler``.

    Returns:
        The tuple ``(X_train, X_test)`` produced by ``normalize_data``, so
        callers can unpack it as ``X_train, X_test = standardize_data(...)``.
    """
    from sklearn.preprocessing import StandardScaler
    return normalize_data(X_train, X_test, StandardScaler())
def min_max_scaling(X_train, X_test):
    """Scale both splits with a ``MinMaxScaler``.

    Returns:
        The tuple ``(X_train, X_test)`` produced by ``normalize_data``, so
        callers can unpack it as ``X_train, X_test = min_max_scaling(...)``.
    """
    from sklearn.preprocessing import MinMaxScaler
    return normalize_data(X_train, X_test, MinMaxScaler())

In [None]:
def encode_data(df, columns):
    """Label-encode the listed columns of *df* in place.

    Names in *columns* that are not columns of *df* are skipped.

    Args:
        df: Frame whose columns are overwritten with integer codes.
        columns: Candidate column names to encode.

    Returns:
        The same *df* object, after modification.
    """
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    present = [name for name in columns if name in df.columns]
    for name in present:
        # The encoder is re-fitted for every column, so codes are per column.
        df[name] = encoder.fit_transform(df[name])
    return df

In [None]:
from sklearn.preprocessing import LabelEncoder

def add_model(name, model):
    """Fit *model* on a fresh train/test split and register it in ``results``.

    The model named ``'dtc'`` is trained without the
    ``age_on_loan_request_disc`` column; every other model is trained without
    ``age_on_loan_request`` instead. Categorical columns are label-encoded
    before the data is split.

    Args:
        name: Key under which the entry is stored in ``results``.
        model: Estimator exposing ``fit(X, y)``.
    """
    unused_age_column = (
        'age_on_loan_request_disc' if name == 'dtc' else 'age_on_loan_request'
    )
    # Work on a copy so the module-level ``data`` frame is never modified.
    df = data.copy()
    df.drop(columns=[unused_age_column], inplace=True)

    df = encode_data(df, categorical_columns)
    X_train, X_test, y_train, y_test = split_data(get_features(df), get_target(df))
    # Optional feature scaling, currently disabled:
    # X_train, X_test = standardize_data(X_train, X_test)
    # X_train, X_test = min_max_scaling(X_train, X_test)

    model.fit(X_train, y_train)
    results[name] = dict(
        model=model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters, registered as 'dtc'.
add_model('dtc', DecisionTreeClassifier())

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, registered as 'rf'.
add_model('rf', RandomForestClassifier())

In [None]:
from sklearn.svm import SVC
# Support vector classifier registered as 'svc'; probability=True is needed
# so that predict_proba() is available on the fitted model.
add_model('svc', SVC(probability=True))

In [None]:
def predict(name):
    """Store the test-split predictions of model *name* under ``'pred'``.

    Reads the ``'model'`` and ``'X_test'`` entries of ``results[name]`` and
    writes the output of ``model.predict`` back into the same entry.
    """
    entry = results[name]
    entry['pred'] = entry['model'].predict(entry['X_test'])

# Compute test-split predictions for every registered model.
for name in results:
    predict(name)

In [None]:
def predict_proba(name):
    """Store the test-split class probabilities of model *name*.

    Reads the ``'model'`` and ``'X_test'`` entries of ``results[name]`` and
    writes the output of ``model.predict_proba`` under ``'pred_prob'``.
    """
    entry = results[name]
    entry['pred_prob'] = entry['model'].predict_proba(entry['X_test'])

# Compute test-split class probabilities for every registered model.
for name in results:
    predict_proba(name)


## Evaluation

In [None]:
def recall(cm, i):
    """Return the recall of class *i* from confusion matrix *cm*.

    Computed as the diagonal entry of row *i* divided by the sum of row *i*.
    """
    row = cm[i]
    return row[i] / sum(row)

def precision(cm, i):
    """Return the precision of class *i* from confusion matrix *cm*.

    Computed as the diagonal entry of column *i* divided by the sum of
    column *i*. *cm* may be a nested sequence or a NumPy array.
    """
    matrix = np.asarray(cm)
    return matrix[i, i] / matrix[:, i].sum()

def f_measure(cm, i):
    """Return the F-measure of class *i* from confusion matrix *cm*.

    Precision and recall are scaled to percentages before they are combined
    as their harmonic mean, so the result is on a 0-100 scale.
    """
    precision_pct = 100 * precision(cm, i)
    recall_pct = 100 * recall(cm, i)
    numerator = 2 * (precision_pct * recall_pct)
    return numerator / (precision_pct + recall_pct)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def conf_matrix(y_test, y_pred):
    """Plot the confusion matrix of *y_pred* against *y_test*.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
    """
    matrix = confusion_matrix(y_test, y_pred)
    # Disabled debugging output, kept for reference:
    #   print("TP:", cm[1][1])
    #   print("TN:", cm[0][0])
    #   print("FP:", cm[0][1])
    #   print("FN:", cm[1][0])
    # NOTE(review): the display labels are applied positionally to the rows
    # and columns of the matrix -- confirm 'True'/'False' match the intended
    # classes.
    chart = ConfusionMatrixDisplay(matrix, display_labels=['True', 'False'])
    chart.plot()

In [None]:
def score(name):
    """Compute, store and print the test-split score of model *name*.

    The value returned by ``model.score(X_test, y_test)`` is saved under
    ``'score'`` in ``results[name]`` and printed after the upper-cased name.
    """
    entry = results[name]
    value = entry['model'].score(entry['X_test'], entry['y_test'])
    entry['score'] = value
    print(f"{name.upper()}:", value)

# Print the test-split score of every registered model.
for name in results:
    score(name)

In [None]:
# Decision threshold applied to the probability in the last column of
# 'pred_prob': values above it are labelled 1, everything else -1.
prob = 0.50

for name in results:
    result = results[name]
    conf_matrix(
        result['y_test'],
        np.where(result['pred_prob'][:, -1] > prob, 1, -1),
    )

In [None]:
from sklearn.metrics import roc_curve, RocCurveDisplay, auc

def roc_and_auc(name):
    """Plot the ROC curve of model *name* and print its AUC.

    The first entry of ``model.classes_`` is used as the positive label and
    is scored with the first column of the stored ``'pred_prob'`` array.
    Returns ``None``.
    """
    entry = results[name]
    y_true = entry['y_test']
    scores = entry['pred_prob'][:, 0]
    positive_label = entry['model'].classes_[0]

    # The thresholds returned by roc_curve are not needed here.
    fpr, tpr, _ = roc_curve(y_true, scores, pos_label=positive_label)
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
    print(name.upper() + ":", auc(fpr, tpr))

# Plot the ROC curve and print the AUC of every registered model.
for name in results:
    roc_and_auc(name)