# Threshold Analysis

In [1]:
import os

log_folders = os.listdir('../../results/threshold/2023-10-19/')


In [2]:
# Sample log folder
# [2023-10-19 09:50:30,844][__main__][INFO] - Data: Decision Tree Feature Selection Dataset, Model: random_forest, Target: hf15
# [2023-10-19 09:50:30,958][__main__][INFO] - Model already trained
# [2023-10-19 09:50:31,098][__main__][INFO] - F1 Validation score weighted: 0.9960578118524658
# [2023-10-19 09:50:31,200][__main__][INFO] - F1 Test score weighted: 0.9966257589337173
# [2023-10-19 09:50:31,301][__main__][INFO] - Confusion matrix: [[1627    3]
#  [   8 1622]]

# extract the best configuration that has the lowest false negative rate

import re
import pandas as pd
import numpy as np

def extract_log(log_file):
    with open(log_file, 'r') as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = ' '.join(line.split('[INFO]')[1:])
            if 'F1 Test score weighted' in line:
                f1_test = float(re.findall(r'\d+\.\d+', line)[0])
            if 'F1 Validation score weighted' in line:
                f1_val = float(re.findall(r'\d+\.\d+', line)[0])
            if 'Confusion matrix' in line:
                cm = re.findall(r'\d+', line + lines[i+1])
                # print(cm)
                cm = np.array(cm).reshape(2,2).astype(int)
    return f1_test, f1_val, cm

In [3]:
def extract_info_folder(folder):
    settings = folder.split(',')
    dataset = settings[0].split('=')[1].strip()
    model = settings[1].split('=')[1].strip()
    target = settings[2].split('=')[1].strip()
    threshold_1 = settings[3].split('=')[1].strip().split('_')[1]
    threshold_2 = settings[4].split('=')[1].strip().split('_')[1]

    # print everything in new line and left align
    print(f'Dataset: {dataset}')
    print(f'Model: {model}')
    print(f'Target: {target}')
    print(f'Threshold CAHI > {threshold_1}')
    print(f'Threshold CAHI > OAHI * 1/{threshold_2}')


In [5]:
best_false_negative = 1000
for folder in log_folders:
    f1_test, f1_val, cm = extract_log('../../results/threshold/2023-10-19/' + folder + '/find_threshold.log')
    # find the lowest false negative rate
    false_negative = cm[0][1]
    if false_negative < best_false_negative:
        best_false_negative = false_negative
        best_folder = folder
        best_cm = cm
        best_f1_test = f1_test
        best_f1_val = f1_val

In [7]:
extract_info_folder(best_folder)

Dataset: feature_selection_dt
Model: decision_tree
Target: hf15
Threshold CAHI > 4
Threshold CAHI > OAHI * 1/1
