# Analysis of an experiment

This notebook computes the average confusion matrices and ROC curves of the models evaluated in an experiment.

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
import seaborn as sns

# Use 300 dpi as the default resolution of every matplotlib figure created
# in this notebook.
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

# Folder where the figures produced by this notebook are saved, located two
# levels above the current working directory.
image_path = os.path.join('..', '..', 'images')
# `exist_ok=True` creates the folder (and missing parents) when needed and is
# a no-op when it already exists, avoiding the check-then-create race of a
# separate `os.path.exists` test.
os.makedirs(image_path, exist_ok=True)

In [None]:
# Root folder of the experiment being analysed; every input file read by the
# cells below is resolved relative to it.
path = os.path.join('logs', 'exp00')

In [None]:
# Load the experiment's summary table. Its `model` column is iterated by the
# cells below to locate the per-fold files of each model.
# NOTE(review): presumably one row per model with metrics averaged over the
# folds (going by the file name) - confirm against the training script.
results = pd.read_csv(os.path.join(path, 'models_mean_results.csv'))
# Show the table as the cell output.
results

## Confusion matrices

In [None]:
# Folder, inside the experiment folder, from which the per-fold
# confusion-matrix CSV files are read.
cm_path = os.path.join(path, 'confusion-matrix')

In [None]:
# Map every model listed in `results` to the list of its per-fold confusion
# matrices (folds 0, 1 and 2), each parsed from a comma-separated file named
# `model_<model>_fold_<fold>_cm.csv` inside `cm_path`.
confusion_matrices = {
    model_name: [
        np.genfromtxt(
            os.path.join(cm_path, f'model_{model_name}_fold_{fold_idx}_cm.csv'),
            delimiter=',',
        )
        for fold_idx in range(3)
    ]
    for model_name in results.model
}

In [None]:
# Plot, for every model, its fold-averaged confusion matrix as an annotated
# heatmap and save the figure into `image_path`.

# Apply the seaborn theme once, before any figure is created, so that every
# figure (including the first one) is built under the same settings.
sns.set(font_scale=2.5)

for model, cms in confusion_matrices.items():
    # Element-wise mean of the per-fold matrices.
    aux_cm = np.array(cms).mean(axis=0)
    # Share of each cell within its row; rows are the true labels on the plot.
    percentages_cm = aux_cm / aux_cm.sum(axis=1, keepdims=True)
    # One annotation per cell: the averaged count with the row share below it.
    # Reshaping to the matrix's own shape (instead of a fixed 2x2) keeps the
    # annotations valid for any number of classes.
    labels = np.asarray([
        f'{count:0.0f}\n({share:.2%})'
        for count, share in zip(aux_cm.flatten(), percentages_cm.flatten())
    ]).reshape(aux_cm.shape)

    plt.figure(figsize=(15, 10))
    # `fmt=''` is required because the annotations are preformatted strings.
    sns.heatmap(aux_cm, annot=labels, fmt='', cmap='Blues')
    plt.xlabel('Predicted Label', fontdict=dict(size=25))
    plt.ylabel('True Label', fontdict=dict(size=25))
    plt.savefig(os.path.join(image_path, f'average_cm_{model}_regression.png'), transparent=True)

## ROC curves

https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

In [None]:
# Folder, inside the experiment folder, from which the per-fold prediction
# CSV files are read.
preds_path = os.path.join(path, 'predictions')
# Show the resolved path as the cell output.
preds_path

In [None]:
# For every model listed in `results`, build a mean ROC curve over folds 1-3
# and summarise the per-fold AUC scores. Each fold's curve is resampled onto
# a common grid of 100 false-positive rates so the curves can be averaged
# point by point.
preds_values = {}
for model in results.model:
    fpr_grid = np.linspace(0, 1, 100)
    fold_tprs, fold_aucs = [], []
    for fold in range(1, 4):
        fold_csv = os.path.join(preds_path, f'{model}_fold{fold}_predictions.csv')
        fold_preds = pd.read_csv(fold_csv)
        y_true, y_score = fold_preds.y_test, fold_preds.y_pred_proba

        fold_aucs.append(metrics.roc_auc_score(y_true, y_score))

        # Only the curve coordinates are needed; the thresholds are dropped.
        fold_fpr, fold_tpr = metrics.roc_curve(y_true, y_score)[:2]
        resampled_tpr = np.interp(fpr_grid, fold_fpr, fold_tpr)
        # Pin the start of the resampled curve to the origin.
        resampled_tpr[0] = 0.0
        fold_tprs.append(resampled_tpr)

    averaged_tpr = np.mean(fold_tprs, axis=0)
    # Pin the end of the averaged curve to TPR = 1.
    averaged_tpr[-1] = 1.0

    preds_values[model] = dict(
        fpr=fpr_grid,
        tpr=averaged_tpr,
        mean=np.mean(fold_aucs),
        std=np.std(fold_aucs),
    )

In [None]:
# Draw the mean ROC curve of every model on a single figure, together with
# the diagonal of a random classifier, and save it into `image_path`.
sns.set(font_scale=1.5)
plt.figure(figsize=(15, 15))
# set white grid
sns.set_style("whitegrid")

line_width = 3
for model, result in preds_values.items():
    # Keep only the part of the model identifier before the first underscore.
    model_name = model.split("_")[0]
    # Raw f-string: `\p` is not a valid Python escape sequence, so the mathtext
    # `$\pm$` must not be written in a regular string literal. The fixed
    # three-decimal format keeps all legend entries aligned.
    curve_label = (
        rf'Mean ROC (AUC= {result["mean"]:.3f} $\pm$ {result["std"]:.3f})'
        f' - {model_name}'
    )
    plt.plot(result['fpr'], result['tpr'], label=curve_label, lw=line_width)

plt.plot([0, 1], [0, 1], linewidth=2, linestyle='dashed', color='g', label='Random Classifier')
# Build the legend only after every labelled line has been drawn; calling it
# earlier leaves the 'Random Classifier' entry out of the legend.
plt.legend(fontsize=14)

plt.xlabel('False Positive Rate', fontdict=dict(size=20))
plt.ylabel('True Positive Rate', fontdict=dict(size=20))
plt.savefig(os.path.join(image_path, 'roc_curve.png'), transparent=False)