In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
import math

In [3]:
optimal = ["species_LENET5_ITS_4mer_32,32,4",
  "species_RESNET8_ITS_4mer_16,64,128,7",
  "species_RESNET10_ITS_4mer_16,64,128,256,7",
  "species_RESNET18_ITS_4mer_32,64,256,7",
  "species_RESNET34_ITS_4mer_64,128,256,7"]

In [8]:
def parse_epoch_file(file_path):
    epoch_data = []
    with open(file_path, 'r') as file:
        current_epoch = None
        current_data = []
        for line in file:
            line = line.strip()
            if line.startswith('----- Epoch'):
                if current_epoch is not None:
                    epoch_data.append(current_data)
                current_epoch = int(line.split(' ')[-2])
                current_data = []
            elif line:
                current_data.append(line)
        if current_epoch is not None:
            epoch_data.append(current_data)
    return epoch_data

In [9]:
def list_to_dataframe(data_list):
    data = [item.split(',') for item in data_list]
    df = pd.DataFrame(data, columns=['true', 'pred'])
    return df

In [32]:
model = optimal[4]
print(model)
num_classes = 3323

directory = 'new_results/' + model
with open(directory + '/maximum_epochs.txt', 'r') as f:
    maximum_epochs = [x.strip() for x in f.readlines()]

precision_list = []
recall_list = []
f1_score_list = []
MCC_score_list = []

from sklearn.metrics import confusion_matrix

for i in range(1, 11):
    target_epoch = int(maximum_epochs[i-1])-1
    if i < 10: x = '0' + str(i)
    else: x = str(i)

    file_path = directory + '/Fold_' + x + '.csv'
    epoch_data = parse_epoch_file(file_path)[target_epoch]
    
    dfData = list_to_dataframe(epoch_data)
    labels = [int(x) for x in dfData['true'].values]
    preds = [int(x) for x in dfData['pred'].values]

    conf_matrix = pd.DataFrame(0, index=np.arange(num_classes), columns=np.arange(num_classes))

    for label, pred in zip(labels, preds):
        conf_matrix.at[label, pred] += 1

    precision_dict = {}
    recall_dict = {}
    f1_score_dict = {}
    MCC_score_dict = {}
    
    for i in tqdm(range(num_classes)):
        TP = conf_matrix.iloc[i, i]
        FP = conf_matrix.iloc[:, i].sum() - TP
        FN = conf_matrix.iloc[i, :].sum() - TP
        TN = conf_matrix.values.sum() - (TP + FP + FN)
        
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        numerator = (TP * TN) - (FP * FN)
        denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
        if denominator > 0:
            denominator = math.sqrt(denominator)
            MCC_score = numerator / denominator
        else: MCC_score = 0
        
        precision_dict[i] = precision
        recall_dict[i] = recall
        f1_score_dict[i] = f1_score
        MCC_score_dict[i] = MCC_score
    
    precision = sum(precision_dict.values()) / len(precision_dict)
    recall = sum(recall_dict.values()) / len(recall_dict)
    f1_score = sum(f1_score_dict.values()) / len(f1_score_dict)
    MCC_score = sum(MCC_score_dict.values()) / len(MCC_score_dict)
    
    precision_list.append(str(precision))
    recall_list.append(str(recall))
    f1_score_list.append(str(f1_score))
    MCC_score_list.append(str(MCC_score))

species_RESNET34_ITS_4mer_64,128,256,7


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 291.95it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 293.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 296.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 295.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3

In [33]:
print(','.join(precision_list))
print(np.mean([float(x) for x in precision_list]))

0.8673645443732695,0.8660504707450227,0.8640635971511678,0.857900204920968,0.8696057779115246,0.8676296519209524,0.8565985507454044,0.866516916727569,0.864088674892164,0.8736490262671406
0.8653467415655183


In [34]:
print(','.join(recall_list))
print(np.mean([float(x) for x in recall_list]))

0.8600662052362323,0.8564550105326513,0.8561540776406861,0.8496840204634366,0.8566054769786338,0.8563045440866687,0.8523924164911225,0.8591634065603371,0.8540475473969305,0.8623232019259706
0.856319590731267


In [35]:
print(','.join(f1_score_list))
print(np.mean([float(x) for x in f1_score_list]))

0.8489722902903778,0.8455975428590535,0.8445423670521479,0.8377940663676444,0.8467936316868012,0.8460613616496857,0.8400277005904456,0.8477305838575775,0.8428750555292841,0.8517021815246312
0.8452096781407649


In [36]:
print(','.join(MCC_score_list))
print(np.mean([float(x) for x in MCC_score_list]))

0.8561194931680921,0.8531832639844275,0.8520806867951918,0.8455404708104495,0.8546993389480475,0.8537722583272548,0.8470326517432116,0.8550559977899704,0.8507238184470445,0.8595889821703147
0.8527796962184004


In [163]:
precision_list = []
recall_list = []
f1_score_list = []
MCC_score_list = []

for i in range(1, 11):
    if i < 10: x = '0' + str(i)
    else: x = str(i)

    dfData = pd.read_csv(f'results/species/GAUSSIAN_ITS_4mer/Fold_{x}.csv', names=['true','pred'])
    labels = [int(x) for x in dfData['true'].values]
    preds = [int(x) for x in dfData['pred'].values]

    conf_matrix = pd.DataFrame(0, index=np.arange(3323), columns=np.arange(3323))

    for label, pred in zip(labels, preds):
        conf_matrix.at[label, pred] += 1
    
    precision_dict = {}
    recall_dict = {}
    f1_score_dict = {}
    MCC_score_dict = {}
    
    for i in tqdm(range(num_classes)):
        TP = conf_matrix.iloc[i, i]
        FP = conf_matrix.iloc[:, i].sum() - TP
        FN = conf_matrix.iloc[i, :].sum() - TP
        TN = conf_matrix.values.sum() - (TP + FP + FN)
        
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        numerator = (TP * TN) - (FP * FN)
        denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
        if denominator > 0:
            denominator = math.sqrt(denominator)
            MCC_score = numerator / denominator
        else: MCC_score = 0
        
        precision_dict[i] = precision
        recall_dict[i] = recall
        f1_score_dict[i] = f1_score
        MCC_score_dict[i] = MCC_score
    
    precision = sum(precision_dict.values()) / len(precision_dict)
    recall = sum(recall_dict.values()) / len(recall_dict)
    f1_score = sum(f1_score_dict.values()) / len(f1_score_dict)
    MCC_score = sum(MCC_score_dict.values()) / len(MCC_score_dict)
    
    precision_list.append(str(precision))
    recall_list.append(str(recall))
    f1_score_list.append(str(f1_score))
    MCC_score_list.append(str(MCC_score))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 293.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 298.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 293.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3323/3323 [00:11<00:00, 294.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3

In [164]:
print(','.join(precision_list))
print(np.mean([float(x) for x in precision_list]))

0.8139550607903082,0.8075636597245103,0.8120965905439751,0.8180561766262209,0.8170846451362112,0.8189127020298139,0.8113797880496227,0.8136852963773193,0.8206412999809288,0.8121956384757092
0.8145570857734621


In [165]:
print(','.join(recall_list))
print(np.mean([float(x) for x in recall_list]))

0.777008727053867,0.7746012639181462,0.7709900692145651,0.7794161901895877,0.7755040625940415,0.7801685224195004,0.778663857959675,0.775353596148059,0.777008727053867,0.7762563948239543
0.7764971411375263


In [166]:
print(','.join(f1_score_list))
print(np.mean([float(x) for x in f1_score_list]))

0.7735162831821163,0.7695511209609259,0.7688204532275509,0.7766824090844043,0.7739320938758089,0.7773980927565606,0.7731757685125231,0.7727188447287127,0.776140033961633,0.7723728945656247
0.7734307994855861


In [167]:
print(','.join(MCC_score_list))
print(np.mean([float(x) for x in MCC_score_list]))

0.7839820769249282,0.7798404789378772,0.7796473424660615,0.7872150504107457,0.7846364350564675,0.7879672677390508,0.7836058927395319,0.7831385956855299,0.7869689564201011,0.7828355804192377
0.7839837676799531
