# Evaluation scores



example: <br>
higher log probability ==> predicted class
if [score 0]=-2 and [score 1]=-3 then it means that the predicted class is class 0 <br>
[score] = [score 0] - [score 1] = -2 - (-3) = 1 > 0 --> class 0
<br>
###########
<br>
if [score 0]=-3 and [score 1]=-2 then it means that the predicted class is class 1 <br>
[score] = [score 0] - [score 1] = -3 - (-2) = -1 < 0 --> class 1

***
## Load the scores

In [18]:
import csv
import os  # used when building the evaluation dataframe path; do not rely on the star import below to provide it
import pandas as pd
# NOTE(review): star import kept in its original position so that any name it
# defines is still overridden by the explicit sklearn import that follows.
from src.utils import *
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [4]:
# Locations of the score files: one for the clean evaluation set and one for
# the FGSM-perturbed set (file name suffix "2dot0").
score_clean_csv = '../eval/scores_resnet_spec_eval.csv'
score_2dot0_csv = '../eval/scores_resnet_spec_eval_FGSM_2dot0.csv'

# Both files are header-less and space-delimited, so they are read the same way.
scores_clean, scores_2dot0 = (
    pd.read_csv(score_path, delimiter=' ', header=None, engine='python')
    for score_path in (score_clean_csv, score_2dot0_csv)
)

In [5]:
# Ground-truth file of the evaluation dataset (ASVSpoof2019).
# The config is indexed with the key 'df_eval_path'; that entry is joined onto
# '..' to locate the evaluation CSV relative to this notebook.
# `os` must be imported explicitly in the imports cell: it is not guaranteed to
# be provided by `from src.utils import *`.
config_path = '../config/residualnet_train_config.yaml'
config = read_yaml(config_path)  # presumably returns a dict-like config; verify in src.utils
df_eval_path = os.path.join('..', config['df_eval_path'])
df_eval = pd.read_csv(df_eval_path)

In [6]:
# Sanity check: the ground-truth list and both score lists should have the
# same number of rows, otherwise labels and predictions cannot be compared.
length_report = (
    f'The length of the evaluation file list is {len(df_eval)}\n'
    f'The length of the clean score list is {len(scores_clean)}\n'
    f'The length of the epsilon=2.0 score list is {len(scores_2dot0)}'
)
print(length_report)

The length of the evaluation file list is 71237
The length of the clean score list is 71237
The length of the epsilon=2.0 score list is 71237


In [8]:
# the score values .csv uses a space as a delimiter between the file path and the score

def convert_column_to_binary(csv_file):
    """Turn a whitespace-delimited score file into a list of predicted labels.

    Every line is split on whitespace and its second field is parsed as the
    score. A strictly positive score maps to label 0, anything else
    (zero, negative, NaN) maps to label 1.

    Lines that cannot be parsed -- blank lines, lines with fewer than two
    fields, or lines whose second field is not a number (e.g. a header) --
    are skipped. Skipping shortens the returned list, so callers that pair it
    with ground-truth labels should compare the lengths.

    Args:
        csv_file: path of the score file to read.

    Returns:
        list[int]: one 0/1 label per successfully parsed line, in file order.
    """
    binary_list = []

    with open(csv_file, 'r') as file:
        for line in file:
            # Split each line on whitespace: <file path> <score>
            parts = line.split()

            # The parse itself has to live inside the try block, otherwise a
            # non-numeric or missing score column aborts the whole conversion.
            try:
                value = float(parts[1])
            except (IndexError, ValueError):
                continue

            binary_list.append(0 if value > 0 else 1)

    return binary_list

In [10]:
# Predicted labels for the clean and the epsilon=2.0 score files, in that order.
pred_labels_clean, pred_labels_2dot0 = (
    convert_column_to_binary(csv_file=score_path)
    for score_path in (score_clean_csv, score_2dot0_csv)
)

In [16]:
# Ground-truth labels: the last column of the evaluation dataframe, as a plain list.
gt_column = df_eval.iloc[:, -1]
GT_labels = gt_column.tolist()

***
## Unbalanced accuracy

In [22]:
# Plain (unbalanced) accuracy of both prediction lists against the same ground truth.
unb_acc_clean, unb_acc_2dot0 = (
    accuracy_score(y_true=GT_labels, y_pred=predicted)
    for predicted in (pred_labels_clean, pred_labels_2dot0)
)

print(
    f'The unbalanced acc for clean eval set is {unb_acc_clean*100:.2f}%\n'
    f'The unblanaced acc for epsilon = 2.0 set is {unb_acc_2dot0*100:.2f}%'
)


The unbalanced acc for clean eval set is 77.15%
The unblanaced acc for epsilon = 2.0 set is 36.29%


## Balanced accuracy

In [23]:
# Balanced accuracy of both prediction lists against the same ground truth.
bal_acc_clean, bal_acc_2dot0 = (
    balanced_accuracy_score(y_true=GT_labels, y_pred=predicted)
    for predicted in (pred_labels_clean, pred_labels_2dot0)
)

print(
    f'The balanced acc for clean eval set is {bal_acc_clean*100:.2f}%\n'
    f'The blanaced acc for epsilon = 2.0 set is {bal_acc_2dot0*100:.2f}%'
)

The balanced acc for clean eval set is 84.21%
The blanaced acc for epsilon = 2.0 set is 49.97%


## Computing the ROC and EER

In [9]:
from sklearn.metrics import roc_curve
import numpy as np

In [10]:
def compute_eer(y_true, y_score):
    """Estimate the equal error rate (EER) from a ROC curve.

    The ROC curve is obtained from ``roc_curve(y_true, y_score)``. The
    operating point where the false positive rate and the false negative
    rate (``1 - tpr``) are closest is selected, and the EER is reported as
    the mean of those two rates at that point.

    Args:
        y_true: ground-truth labels, forwarded to ``roc_curve``.
        y_score: scores, forwarded to ``roc_curve``.

    Returns:
        tuple: ``(eer, eer_threshold)`` -- the estimated EER and the ROC
        threshold of the selected operating point.
    """
    false_pos_rate, true_pos_rate, roc_thresholds = roc_curve(y_true, y_score)
    false_neg_rate = 1 - true_pos_rate

    # index of the ROC point where FPR and FNR are closest to each other
    closest_idx = np.argmin(np.abs(false_pos_rate - false_neg_rate))

    # the two rates rarely coincide exactly on a sampled curve, so average them
    eer = (false_pos_rate[closest_idx] + false_neg_rate[closest_idx]) / 2.0

    return eer, roc_thresholds[closest_idx]

In [11]:
# Use the names defined earlier in this notebook (GT_labels, pred_labels_clean);
# `true_labels` / `pred_labels` are not defined anywhere above and fail on a fresh kernel.
# NOTE(review): y_score here is the thresholded 0/1 prediction, not the raw score.
# For a threshold-independent EER consider passing a continuous score oriented so
# that larger values mean class 1 -- confirm the intent.
eer, eer_threshold = compute_eer(GT_labels, pred_labels_clean)

In [12]:
eer  # bare expression: the notebook displays the EER value as the cell output

0.15788922284503523

## Computing the confusion matrix
* TN (true negative): actual class was 0 (BF, bona fide) and predicted as 0
* FP (false positive): actual class was 0, but predicted as 1 (DF, deepfake)
* FN (false negative): actual class was 1, but predicted as 0
* TP (true positive): actual class was 1 and predicted as 1

In [13]:
from sklearn.metrics import confusion_matrix

In [14]:
# Use the names defined earlier in this notebook (GT_labels, pred_labels_clean);
# `true_labels` / `pred_labels` are not defined anywhere above and fail on a fresh kernel.
cm = confusion_matrix(GT_labels, pred_labels_clean)
print(cm)

# Layout of the printed matrix (rows = actual class, columns = predicted class):
# [ TN    FP
#   FN    TP ]

[[ 6848   507]
 [15769 48113]]


## Computing the indices of true positives
aka those audios with GT=1 and prediction=1, which means they are DF and the model correctly identified them as DF


In [26]:
def find_TP(list1, list2):
    """Return the indices at which both sequences hold the value 1.

    With ``list1`` as ground-truth labels and ``list2`` as predicted labels
    these are the true positives.

    Args:
        list1: first label sequence.
        list2: second label sequence, same length as ``list1``.

    Returns:
        list[int]: positions ``i`` with ``list1[i] == 1`` and ``list2[i] == 1``,
        in increasing order.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(list1) != len(list2):
        raise ValueError(
            f'list1 and list2 must have the same length, got {len(list1)} and {len(list2)}'
        )

    # Logical `and`, not bitwise `&`: `a == 1 & b == 1` is parsed as the chained
    # comparison `a == (1 & b) == 1`, which is only accidentally right for 0/1 ints.
    return [
        idx
        for idx, (first, second) in enumerate(zip(list1, list2))
        if first == 1 and second == 1
    ]

# Use the names defined earlier in this notebook (GT_labels, pred_labels_clean);
# `true_labels` / `pred_labels` are not defined anywhere above and fail on a fresh kernel.
TP_indices = find_TP(GT_labels, pred_labels_clean)
TP_indices[:10]

[1, 2, 3, 4, 6, 8, 11, 13, 16, 18]

In [24]:
len(TP_indices)  # number of true positives; comparable with the TP cell of the confusion matrix

48113