# Models and Evaluation

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# ALL_DATA picks which processed CSV is loaded below.
ALL_DATA = False
# use_smote is read by cross_validation to decide whether SMOTE is added to the pipeline.
use_smote = True

if ALL_DATA:
    data = pd.read_csv('data_processed/complete/enc_data.csv')
else:
    # The selected-features file still carries the loan identifier; drop it.
    data = pd.read_csv('data_processed/complete/data_selected.csv').drop(columns='loan_id')

# Preview the first rows of whichever dataset was loaded.
display(data.head())

In [None]:
def get_features(df):
    """Return a copy of `df` without the 'status' column (the model inputs)."""
    return df.drop(columns='status')
def get_target(df):
    """Return the 'status' column of `df` (the prediction target)."""
    return df.loc[:, 'status']

# Maps each model name to the array of test scores stored by cross_validation.
results = dict()

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

def make_pipeline(classifier, smote = False):
    """Wrap `classifier` in an imbalanced-learn pipeline.

    When `smote` is truthy, a seeded SMOTE step (random_state=1,
    sampling_strategy=1.0) named 'smote' is placed before the final
    'classifier' step; otherwise the pipeline holds the classifier alone.
    """
    resampling = [['smote', SMOTE(random_state=1, sampling_strategy=1.0)]] if smote else []
    return imbpipeline(steps=resampling + [['classifier', classifier]])

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

# Seeded splitter: 10 stratified folds repeated 3 times; used as the default cv of cross_validation.
stratified_kfold = RepeatedStratifiedKFold(
    n_repeats=3,
    n_splits=10,
    random_state=1,
)

In [None]:
from sklearn.model_selection import cross_validate

def cross_validation(name, model, df=data, cv=stratified_kfold):
    """Cross-validate `model` on `df` and record its ROC AUC scores under `name`.

    The estimator is wrapped with make_pipeline (SMOTE is included when the
    module-level `use_smote` flag is set) and scored with 'roc_auc' on the
    splits produced by `cv`. The array of test scores is stored in the
    module-level `results` dict, and the name, mean and standard deviation
    are printed. Nothing is returned.

    Note: the `df` and `cv` defaults are bound when this definition is
    executed, so re-run it after changing `data` or `stratified_kfold`.
    """
    pipeline = make_pipeline(model, smote=use_smote)
    X, y = get_features(df), get_target(df)
    cv_output = cross_validate(pipeline, X, y, scoring='roc_auc', cv=cv)
    split_scores = cv_output['test_score']

    results[name] = split_scores
    print(name, split_scores.mean(), split_scores.std())

### Algorithms

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with fixed hyperparameters. max_features="sqrt" makes the tree
# pick a random subset of features at each split, so random_state is pinned to
# keep the reported AUC reproducible, in line with the seeded SMOTE step and
# CV splitter.
cross_validation(
    'DT',
    DecisionTreeClassifier(
        criterion="entropy",
        max_depth=17,
        max_features="sqrt",
        max_leaf_nodes=13,
        min_samples_leaf=4,
        min_samples_split=2,
        random_state=1,
    ),
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours: 11 neighbours, votes weighted by distance.
cross_validation(
    'KNN',
    KNeighborsClassifier(
        n_neighbors=11,
        weights="distance",
        algorithm="auto",
    ),
)

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with library-default hyperparameters. Its weight
# initialisation is random, so random_state is pinned to keep the reported AUC
# reproducible, in line with the seeded SMOTE step and CV splitter.
cross_validation(
    'MLP',
    MLPClassifier(random_state=1),
)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with library-default settings.
cross_validation('GNB', GaussianNB())

In [None]:
from sklearn.svm import SVC
# Polynomial-kernel SVM (degree 3), capped at 10000 solver iterations.
# NOTE(review): probability=True is kept from the original configuration;
# confirm it is still needed given that scoring is 'roc_auc'.
cross_validation(
    'SVM',
    SVC(
        kernel='poly',
        degree=3,
        gamma='scale',
        coef0=0.0,
        C=1,
        tol=0.0001,
        shrinking=True,
        max_iter=10000,
        probability=True,
    ),
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression without an intercept term, fitted with the newton-cg solver.
cross_validation(
    'LR',
    LogisticRegression(
        solver='newton-cg',
        C=100,
        fit_intercept=False,
        tol=0.001,
        max_iter=10000,
    ),
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 40 entropy trees. Bootstrap sampling and per-split feature
# subsets are random, so random_state is pinned to keep the reported AUC
# reproducible, in line with the seeded SMOTE step and CV splitter.
cross_validation(
    'RF',
    RandomForestClassifier(
        criterion='entropy',
        max_depth=18,
        max_features='sqrt',
        n_estimators=40,
        random_state=1,
    ),
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with library-default hyperparameters. random_state is
# pinned so repeated runs of this cell report the same AUC, in line with the
# seeded SMOTE step and CV splitter.
cross_validation(
    'GB',
    GradientBoostingClassifier(random_state=1),
)

### Evaluation

In [None]:
import seaborn as sb
import matplotlib.pyplot as plt

# Compare the per-split ROC AUC distributions of every model stored in `results`.
# Build the frame once and reuse it for both layers of the plot.
auc_by_model = pd.DataFrame(results)

plt.figure(figsize=(10, 6))
sb.boxplot(data=auc_by_model)
# Overlay the individual split scores on top of the boxes.
sb.stripplot(data=auc_by_model, color='black')
# Only mention SMOTE in the title when it was actually part of the pipelines.
plt.title('AUC for each model with CV and SMOTE' if use_smote else 'AUC for each model with CV')
plt.show()

In [None]:
# Report each model's mean ROC AUC over all cross-validation splits.
for name, scores in results.items():
    print(name, np.mean(scores))