# Models and Evaluation

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Load the dataset from the processed-data directory (path is relative to the working directory).
data = pd.read_csv('data_processed/complete/data_selected.csv')

# Preview the first rows to sanity-check the load.
display(data.head())

In [None]:
def get_features(df):
    """Return a copy of ``df`` without the ``'status'`` target column."""
    feature_frame = df.drop(columns='status')
    return feature_frame
def get_target(df):
    """Return a one-column DataFrame holding only the ``'status'`` target.

    Every column other than ``'status'`` is dropped, so the result stays a
    DataFrame (not a Series).
    """
    non_target_columns = df.columns.difference(['status'])
    return df.drop(columns=non_target_columns)

# Registry of fitted models keyed by short name; add_model creates the entries and
# the prediction/evaluation cells below add further keys to each entry.
results = {}

### Split data

In [None]:
def split_data(features, target, test_size=0.25, random_state=1):
    """Split features and target into train and test partitions.

    Thin wrapper around scikit-learn's ``train_test_split``; ``test_size``
    and ``random_state`` are forwarded unchanged.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    partitions = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    train_features, test_features, train_target, test_target = partitions
    return train_features, test_features, train_target, test_target

In [None]:
# Size of the full dataset and of the rows with status 1 / status -1.
print('Data shape:', data.shape)
print('Status  1:', data[data['status'] == 1].shape)
print('Status -1:', data[data['status'] == -1].shape)

# Hold-out split; the underscore-prefixed globals are read by add_model.
_X_train, _X_test, _y_train, _y_test = split_data(get_features(data), get_target(data))

# Count of status 1 | status -1 rows in the training split.
print('\nTrain shape:', _y_train.shape)
print('Status ratio:', _y_train[_y_train['status'] == 1].shape[0], '|', _y_train[_y_train['status'] == -1].shape[0])

# Same counts for the test split (label typo 'ration' corrected to match the train label).
print('\nTest shape:', _y_test.shape)
print('Status ratio:', _y_test[_y_test['status'] == 1].shape[0], '|', _y_test[_y_test['status'] == -1].shape[0])

### Resampling

In [None]:
def oversample(X_train, y_train):
    """Oversample the training data with SMOTE and return ``(X, y)``.

    The sampler is created with ``sampling_strategy=1.0`` and a fixed
    ``random_state=1`` so repeated runs give the same resampled data.
    """
    from imblearn.over_sampling import SMOTE

    sampler = SMOTE(sampling_strategy=1.0, random_state=1)
    resampled_X, resampled_y = sampler.fit_resample(X_train, y_train)
    return resampled_X, resampled_y

In [None]:
# Oversample the training split only; the resampled globals are read by add_model.
_smote_X_train, _smote_y_train = oversample(_X_train, _y_train)

# Count of status 1 | status -1 rows after resampling.
print('\nTrain shape:', _smote_y_train.shape)
print(
    'Status ratio:',
    len(_smote_y_train[_smote_y_train['status'] == 1]),
    '|',
    len(_smote_y_train[_smote_y_train['status'] == -1]),
)

### Scaling

In [None]:
def normalize_data(X_train, X_test, scaler):
    """Scale the train and test features with one fitted scaler.

    The scaler is fitted on ``X_train`` only, and the parameters learned
    there are applied to ``X_test``. Re-fitting on the test split would
    scale the two splits differently and leak test statistics into
    preprocessing.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by ``scaler``.
    scaler : object exposing ``fit_transform`` and ``transform``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.
    """
    X_train = scaler.fit_transform(X_train)
    # transform(), not fit_transform(): reuse the statistics learned on the training split.
    X_test = scaler.transform(X_test)
    return X_train, X_test

def standardize_data(X_train, X_test):
    """Scale both splits by passing a new ``StandardScaler`` to ``normalize_data``."""
    from sklearn.preprocessing import StandardScaler

    standard_scaler = StandardScaler()
    return normalize_data(X_train, X_test, standard_scaler)
def min_max_scaling(X_train, X_test):
    """Scale both splits by passing a new ``MinMaxScaler`` to ``normalize_data``."""
    from sklearn.preprocessing import MinMaxScaler

    min_max_scaler = MinMaxScaler()
    return normalize_data(X_train, X_test, min_max_scaler)

### Algorithms

In [None]:
def add_model(name, model, use_smote=False, scale=False):
    """Fit ``model`` on the shared training split and register it in ``results``.

    Parameters
    ----------
    name : key under which the entry is stored in ``results``.
    model : estimator exposing ``fit(X, y)``.
    use_smote : when true, train on the SMOTE-oversampled split
        (``_smote_X_train`` / ``_smote_y_train``) instead of the plain
        training split.
    scale : ``'standard'`` or ``'min_max'`` to scale the features through
        ``standardize_data`` / ``min_max_scaling``; any other value
        (including the default ``False``) leaves the features unscaled.

    The stored entry contains the fitted model, the (possibly scaled)
    train/test features, the flattened train/test targets and ``fit_time``
    in milliseconds.
    """
    import time

    # Work on copies so scaling never alters the shared split globals.
    if use_smote:
        X_train = _smote_X_train.copy()
        y_train = _smote_y_train.copy()
    else:
        X_train = _X_train.copy()
        y_train = _y_train.copy()
    X_test = _X_test.copy()

    # Optional feature scaling.
    if scale == 'standard':
        X_train, X_test = standardize_data(X_train, X_test)
    elif scale == 'min_max':
        X_train, X_test = min_max_scaling(X_train, X_test)

    # Targets are one-column DataFrames; flatten them to 1-D arrays.
    y_train_flat = np.ravel(y_train.values)

    # perf_counter is monotonic and the full elapsed time is kept;
    # timedelta.microseconds would drop whole seconds for fits of 1 s or more.
    start = time.perf_counter()
    model.fit(X_train, y_train_flat)
    fit_time_ms = (time.perf_counter() - start) * 1000

    results[name] = {'model': model,
                     'X_train': X_train,
                     'X_test': X_test,
                     'y_train': y_train_flat,
                     'y_test': np.ravel(_y_test.values),
                     'fit_time': fit_time_ms  # elapsed fit time in milliseconds
                     }

In [None]:
# Global switch passed as use_smote to every add_model call below:
# when True the models are trained on the SMOTE-oversampled split.
SMOTE = True

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree constructed with no arguments, registered under 'dtc'.
add_model(name='dtc', model=DecisionTreeClassifier(), use_smote=SMOTE)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier constructed with no arguments, registered under 'knn'.
add_model(name='knn', model=KNeighborsClassifier(), use_smote=SMOTE)

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron constructed with no arguments, registered under 'mlp'.
add_model(name='mlp', model=MLPClassifier(), use_smote=SMOTE)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes constructed with no arguments, registered under 'gnb'.
add_model(name='gnb', model=GaussianNB(), use_smote=SMOTE)

In [None]:
from sklearn.svm import SVC
# Support vector classifier on standardized features, registered under 'svc'.
# probability=True is needed because predict_proba is called on every model later.
add_model(name='svc', model=SVC(probability=True), use_smote=SMOTE, scale='standard')

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression constructed with no arguments, registered under 'lr'.
add_model(name='lr', model=LogisticRegression(), use_smote=SMOTE)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest constructed with no arguments, registered under 'rf'.
add_model(name='rf', model=RandomForestClassifier(), use_smote=SMOTE)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting constructed with no arguments, registered under 'gb'.
add_model(name='gb', model=GradientBoostingClassifier(), use_smote=SMOTE)

## Prediction

In [None]:
def predict(name, isTrain=False):
    """Store the model's class predictions for one split in its ``results`` entry.

    The predictions for the train split (``isTrain=True``) or the test split
    (default) are saved under the key ``'trainpred'`` or ``'testpred'``.
    """
    if isTrain:
        split = 'train'
    else:
        split = 'test'
    entry = results[name]
    entry[split + 'pred'] = entry['model'].predict(entry['X_' + split])

# Predict on the test split for every registered model, then on the train split.
for name in results:
    predict(name, isTrain=False)
for name in results:
    predict(name, isTrain=True)

In [None]:
def predict_proba(name, isTrain=False):
    """Store the model's class probabilities for one split in its ``results`` entry.

    The output of the model's ``predict_proba`` for the train split
    (``isTrain=True``) or the test split (default) is saved under the key
    ``'trainpred_prob'`` or ``'testpred_prob'``.
    """
    if isTrain:
        split = 'train'
    else:
        split = 'test'
    entry = results[name]
    entry[split + 'pred_prob'] = entry['model'].predict_proba(entry['X_' + split])

# Compute class probabilities on the test split for every model, then on the train split.
for name in results:
    predict_proba(name, isTrain=False)
for name in results:
    predict_proba(name, isTrain=True)


## Evaluation

In [None]:
from matplotlib.figure import Figure
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def conf_matrix(y_test, y_pred, name, prefix):
    """Plot the confusion matrix of ``y_pred`` against ``y_test``.

    The axes are labelled ``'True'`` / ``'False'`` and the plot is titled
    ``"<name> <prefix>"``.
    """
    matrix = confusion_matrix(y_test, y_pred)
    shown = ConfusionMatrixDisplay(matrix, display_labels=['True', 'False']).plot()
    shown.ax_.set_title(name + ' ' + prefix)

### Execution time

In [None]:
def fit_time(name):
    """Print the stored ``fit_time`` of model ``name``, labelled in milliseconds."""
    label = name.upper() + ':\t'
    print(label, results[name]['fit_time'], 'ms')

# One line per registered model with its recorded fit time.
print('Fit execution time')
for name in results:
    fit_time(name)

### Accuracy

In [None]:
def score(name, isTrain=False):
    """Compute, store and print the model's ``score()`` on one split.

    The value returned by the estimator's own ``score`` method for the train
    split (``isTrain=True``) or the test split (default) is saved under
    ``'trainscore'`` or ``'testscore'`` and printed next to the model name.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    score_key = split + 'score'
    entry[score_key] = entry['model'].score(entry['X_' + split], entry['y_' + split])
    print(name.upper() + ":\t", entry[score_key])

# Score every model on the test split first, then on the train split.
print('\tAccuracy')
print("Test")
for name in results:
    score(name, isTrain=False)
print("\nTrain")
for name in results:
    score(name, isTrain=True)

### Precision, Recall and F-measure

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score


def evaluate(name, isTrain=False):
    """Print recall, precision and F1 of the stored predictions for one split.

    All three metrics treat the label ``-1`` as the positive class. The
    predictions must already be stored under ``'trainpred'`` / ``'testpred'``.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    y_true = entry['y_' + split]
    y_pred = entry[split + 'pred']

    precision = precision_score(y_true, y_pred, pos_label=-1)
    recall = recall_score(y_true, y_pred, pos_label=-1)
    f_measure = f1_score(y_true, y_pred, pos_label=-1)

    print(
        split,
        name.upper() + ":",
        '\tRecall:', round(recall, 2),
        '\t Precision:', round(precision, 2),
        '\tF_Measure:', round(f_measure, 2),
    )

# Report the metrics of every model on the test split first, then on the train split.
print('Test')
for name in results:
    evaluate(name, isTrain=False)
print('\nTrain')
for name in results:
    evaluate(name, isTrain=True)


### Confusion Matrix

In [None]:
# Decision threshold applied to the last predict_proba column.
prob = 0.50

prefixes = ('test', 'train')
# Only the test split is plotted; widen the slice to plot the train split too.
# The loop variable is no longer overwritten inside the body, so the slice
# really controls which splits are drawn.
for prefix in prefixes[:1]:
    for name, result in results.items():
        # Assumes the last probability column belongs to class 1
        # (i.e. classes_ is [-1, 1]) - confirm against the fitted models.
        conf_matrix(result['y_'+prefix], np.where(result[prefix+'pred_prob'][:,-1] > prob, 1, -1), name, prefix)

### AUC

In [None]:
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
import matplotlib.pyplot as plt

def roc_and_auc(name, isTrain=False):
    """Plot the test and train ROC curves of one model and print their AUC.

    Both curves are drawn on one shared axis. The score used is the first
    ``predict_proba`` column, with the model's first class (``classes_[0]``)
    as the positive label.

    ``isTrain`` is accepted but not used: both splits are always processed.
    """
    _, axis = plt.subplots()
    entry = results[name]
    model_label = name.upper()
    for split in ('test', 'train'):
        fpr, tpr, _ = roc_curve(
            entry['y_' + split],
            entry[split + 'pred_prob'][:, 0],
            pos_label=entry['model'].classes_[0],
        )
        RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=axis, name=(model_label + ' ' + split))
        print(model_label, split + ":\t", auc(fpr, tpr))

# One ROC figure (test + train curves) per registered model.
for name in results:
    roc_and_auc(name)