# Evaluation

## Implementation

In [1]:
import numpy as np
from typing import Dict
from itertools import product
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, balanced_accuracy_score, \
    confusion_matrix, precision_recall_fscore_support, \
    roc_auc_score, matthews_corrcoef, average_precision_score, \
    log_loss, brier_score_loss
import scipy.stats.mstats as ms
from Funcs.Utility import *


def evaluate(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: np.ndarray,
    classes: np.ndarray
) -> Dict[str, any]:

    R = {}
    n_classes = len(classes)
    is_multiclass = n_classes > 2
    is_same_y = len(np.unique(y_true)) == 1
    R['inst'] = len(y_true)
    
    for c in classes:
        R[f'inst_{c}'] = np.sum(y_true == c)
        
    if not is_multiclass:
        _, cnt = np.unique(y_true, return_counts=True)
        
        if len(cnt) > 1:
            R['class_ratio'] = cnt[0] / cnt[1]
        else:
            R['class_ratio'] = np.nan

    C = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=classes)
    for (i1, c1), (i2, c2) in product(enumerate(classes), enumerate(classes)):
        R[f'true_{c1}_pred_{c2}'] = C[i1, i2]

    # Threshold Measure
    R['acc'] = accuracy_score(y_true=y_true, y_pred=y_pred)
    R['bac'] = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    R['gmean'] = ms.gmean(np.diag(C) / np.sum(C, axis=1))
    R['mcc'] = matthews_corrcoef(y_true=y_true, y_pred=y_pred)
    
    if is_multiclass:
        for avg in ('macro', 'micro'):
            pre, rec, f1, _ = precision_recall_fscore_support(
                y_true=y_true,
                y_pred=y_pred,
                labels=classes,
                average=avg, 
                zero_division=0
            )
            R[f'pre_{avg}'] = pre
            R[f'rec_{avg}'] = rec
            R[f'f1_{avg}'] = f1
    else:
        pre, rec, f1, _ = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, pos_label=1, average='macro', zero_division=0
        )
        R[f'pre_macro'] = pre
        R[f'rec_macro'] = rec
        R[f'f1_macro'] = f1
        
        for c in classes:
            pre, rec, f1, _ = precision_recall_fscore_support(
                y_true=y_true, y_pred=y_pred, pos_label=c, average='binary', zero_division=0
            )
            R[f'pre_{c}'] = pre
            R[f'rec_{c}'] = rec
            R[f'f1_{c}'] = f1



    # ...

    # Ranking Measure
    if is_multiclass:
        if y_prob is not None:
            for avg, mc in product(('macro', 'micro'), ('ovr', 'ovo')):
                R[f'roauc_{avg}_{mc}'] = roc_auc_score(
                    y_true=y_true, y_score=y_prob,
                    average=avg, multi_class=mc, labels=classes
                ) if not is_same_y else np.nan
        else:
            for avg, mc in product(('macro', 'micro'), ('ovr', 'ovo')):
                R[f'roauc_{avg}_{mc}'] = np.nan
    else:
        if y_prob is not None:
            R[f'roauc'] = roc_auc_score(
                y_true=y_true, y_score=y_prob[:, 1], average=None
            ) if not is_same_y else np.nan
            for i, c in enumerate(classes):
                R[f'prauc_{c}'] = average_precision_score(
                    y_true=y_true, y_score=y_prob[:, i], pos_label=c, average=None
                ) 
                R[f'prauc_ref_{c}'] = np.sum(y_true == c) / len(y_true)
        else:
            R[f'roauc'] = np.nan
            for c in classes:
                R[f'prauc_{c}'] = np.nan
                R[f'prauc_ref_{c}'] = np.nan

    # Probability Measure
    if y_prob is not None:
        R['log_loss'] = log_loss(y_true=y_true, y_pred=y_prob, labels=classes, normalize=True)
        if not is_multiclass:
            R[f'brier_loss'] = brier_score_loss(
                y_true=y_true, y_prob=y_prob[:, 1], pos_label=classes[1]
            )
    else:
        R['log_loss'] = np.nan
        if not is_multiclass:
            R[f'brier_loss'] = np.nan

    return R

## Execution

In [2]:
# import os
# import pandas as pd

# RESULTS_EVAL = []
# DIR_EVAL = os.path.join(PATH_INTERMEDIATE, 'eval')
# threshold = 0.5

# # Loop through the desired labels
# for l in ['stress']:
#     dir_l = os.path.join(DIR_EVAL, l)
#     if not os.path.exists(dir_l):
#         continue

#     for f in os.listdir(dir_l):
#         if f == '.ipynb_checkpoints':
#             continue

#         model, pid = f[:f.index('.pkl')].split('#')
#         res = load(os.path.join(dir_l, f))
#         X, y = res.X_test, res.y_test
 
#         # Perform predictions for other classifiers (without datetimes)
#         y_pred = res.estimator.predict(X)

#         if hasattr(res.estimator, 'predict_proba'):
#             y_prob = res.estimator.predict_proba(X)
#         else:
#             y_prob = None

#         ev_test = evaluate(
#             y_true=y,
#             y_pred=y_pred,
#             y_prob=y_prob,
#             classes=[0, 1]
#         )

#         X, y = res.X_train, res.y_train

#         # Perform predictions for other classifiers (without datetimes)
#         y_pred = res.estimator.predict(X)

#         if hasattr(res.estimator, 'predict_proba'):
#             y_prob = res.estimator.predict_proba(X)
#         else:
#             y_prob = None
        
        
#         ev_train = evaluate(
#             y_true=y,
#             y_pred=y_pred,
#             y_prob=y_prob,
#             classes=[0, 1]
#         )

#         RESULTS_EVAL.append({
#             'label': l,
#             'alg': model,
#             'split': pid,
#             'n_feature': len(X.columns),
#             **{f'test_{k}': v for k, v in ev_test.items()},
#             **{f'train_{k}': v for k, v in ev_train.items()}
#         })

# RESULTS_EVAL = pd.DataFrame(RESULTS_EVAL)
# RESULTS_EVAL.head()

In [7]:
import os
import pandas as pd
import ray


@ray.remote
def process_file(f, dir_l, l):
    model, pid = f[:f.index('.pkl')].split('#')
    res = load(os.path.join(dir_l, f))
    results = []
    
    for phase, data in [('test', (res.X_test, res.y_test)), ('train', (res.X_train, res.y_train))]:
        X, y = data
        y_pred = res.estimator.predict(X)
        
        if hasattr(res.estimator, 'predict_proba'):
            y_prob = res.estimator.predict_proba(X)
        else:
            y_prob = None
            
        ev = evaluate(
            y_true=y,
            y_pred=y_pred,
            y_prob=y_prob,
            classes=[0, 1]
        )
        
        result = {
            'label': l,
            'alg': model,
            'split': pid,
            'phase': phase,
            'n_feature': len(X.columns),
            **{f'{phase}_{k}': v for k, v in ev.items()}
        }
        results.append(result)
    
    return results

DIR_EVAL = os.path.join(PATH_INTERMEDIATE, 'eval')
RESULTS_EVAL = []

futures = []
with on_ray():
    for l in ['stress']:
        dir_l = os.path.join(DIR_EVAL, l)
        if not os.path.exists(dir_l):
            continue
        
        for f in os.listdir(dir_l):
            if f == '.ipynb_checkpoints':
                continue
            
            future = process_file.remote(f, dir_l, l)
            futures.append(future)

    # Collect all results
    results = ray.get(futures)
    for result_list in results:
        RESULTS_EVAL.extend(result_list)

    RESULTS_EVAL = pd.DataFrame(RESULTS_EVAL)
    print(RESULTS_EVAL.head())




2024-04-24 13:31:57,102	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


    label     alg split  phase  n_feature  test_inst  test_inst_0  \
0  stress  xgb_os   P20   test        165      520.0        297.0   
1  stress  xgb_os   P20  train        165        NaN          NaN   
2  stress  xgb_os   P19   test        167      619.0        556.0   
3  stress  xgb_os   P19  train        167        NaN          NaN   
4  stress  xgb_os   P11   test        162      452.0        361.0   

   test_inst_1  test_class_ratio  test_true_0_pred_0  ...  train_pre_1  \
0        223.0          1.331839               290.0  ...          NaN   
1          NaN               NaN                 NaN  ...     0.982049   
2         63.0          8.825397               492.0  ...          NaN   
3          NaN               NaN                 NaN  ...     0.982693   
4         91.0          3.967033               339.0  ...          NaN   

   train_rec_1  train_f1_1  train_roauc  train_prauc_0  train_prauc_ref_0  \
0          NaN         NaN          NaN            NaN         

In [8]:
RESULTS_xgbos = RESULTS_EVAL[RESULTS_EVAL['alg']=='xgb_os']

In [9]:
import pandas as pd


SUMMARY_EVAL = []

for row in RESULTS_EVAL.groupby(
#    ['label', 'alg', 'cluster']
     ['label', 'alg']
).agg(summary).reset_index().itertuples():
    for k, v in row._asdict().items():
        if type(v) is dict:
            r = dict(
                label=row.label,
                alg=row.alg,
#                 cluster = row.cluster,
                metric=k,
                **v
            )
            SUMMARY_EVAL.append(r)

SUMMARY_EVAL = pd.DataFrame(SUMMARY_EVAL)    
SUMMARY_EVAL.head()

Unnamed: 0,label,alg,metric,n,cardinality,value_count,sum,mean,SD,med,range,conf.,nan_count
0,stress,xgb_os,split,48,24.0,"P20:2, P19:2, P13:2, P23:2, P27:2, P21:2, P12:...",,,,,,,
1,stress,xgb_os,phase,48,2.0,"test:24, train:24",,,,,,,
2,stress,xgb_os,n_feature,48,,,7754.0,161.541667,2.751853,162.0,"(157, 167)","(160.74261184506273, 162.3407214882706)",0.0
3,stress,xgb_os,test_inst,48,,,13134.0,547.25,133.651477,525.5,"(268.0, nan)","(490.81393070682515, 603.6860692931749)",24.0
4,stress,xgb_os,test_inst_0,48,,,10210.0,425.416667,140.088766,388.0,"(172.0, nan)","(366.26236882270996, 484.5709645106234)",24.0


In [10]:
SUB_SUMMARY_EVAL = SUMMARY_EVAL.loc[
    lambda x: x['metric'].isin(
        ['n_feature','test_acc' ,'test_f1_1',  'test_f1_macro','test_roauc','test_pre_macro','test_rec_macro']
    )
].round(3).assign(
    mean_sd=lambda x: x['mean'].astype(str).str.cat(' (' + x['SD'].astype(str) + ')', sep=''),
).pivot(
    index=['label', 'alg'], columns=['metric'], values=['mean_sd']
)

# separate rows where 'alg' is 'dummy' and 'alg' is not 'dummy'
df_dummy = SUB_SUMMARY_EVAL[SUB_SUMMARY_EVAL.index.get_level_values('alg') == 'dummy']
df_others = SUB_SUMMARY_EVAL[SUB_SUMMARY_EVAL.index.get_level_values('alg') != 'dummy']

# concatenate them ensuring that 'dummy' rows are always at the top for each group
SUB_SUMMARY_EVAL = pd.concat([df_dummy, df_others])

SUB_SUMMARY_EVAL

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_sd,mean_sd,mean_sd,mean_sd,mean_sd,mean_sd,mean_sd
Unnamed: 0_level_1,metric,n_feature,test_acc,test_f1_1,test_f1_macro,test_pre_macro,test_rec_macro,test_roauc
label,alg,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
stress,xgb_os,161.542 (2.752),0.704 (0.124),0.125 (0.083),0.468 (0.047),0.515 (0.032),0.507 (0.026),0.522 (0.057)
