In [3]:
import os
import yaml
import glob
import Levenshtein as lev
from sklearn.metrics import precision_score, recall_score, f1_score

c:\Python311\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [4]:
def load_yaml(file_path):
    """Parse a YAML file and return its content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` (``None`` for an empty
        document). When the file is not valid YAML, the path and the parser
        error are printed and ``None`` is returned.
    """
    # YAML text is UTF-8 by default. Without an explicit encoding, open()
    # uses the platform locale (for example cp1252 on Windows) and can
    # mis-decode non-ASCII names.
    with open(file_path, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Best-effort: report which file failed and let the caller decide.
            print(f"Could not parse {file_path}: {exc}")
            return None

In [5]:
def compare_items(true_items, detected_items):
    """Count exact-match hits and misses between two collections.

    Args:
        true_items: Collection of expected items.
        detected_items: Collection of items that were found.

    Returns:
        A ``(TP, FP, FN)`` tuple: detected items present in ``true_items``,
        detected items absent from ``true_items``, and true items absent
        from ``detected_items``.
    """
    # One membership flag per detected item, in iteration order.
    membership = [candidate in true_items for candidate in detected_items]
    true_positives = sum(membership)
    false_positives = len(membership) - true_positives

    false_negatives = sum(
        1 for expected in true_items if expected not in detected_items
    )

    return true_positives, false_positives, false_negatives

In [6]:
def compare_constructs(true_constructs, detected_constructs, max_distance=3):
    """Count fuzzy matches between ground-truth and detected construct names.

    The names compared are the values of the two mappings; duplicate values
    are collapsed first. Two names match when ``is_similar`` accepts them
    with the given ``max_distance``.

    Args:
        true_constructs: Mapping whose values are the ground-truth names.
            ``None`` or an empty mapping is treated as "no constructs".
        detected_constructs: Mapping whose values are the detected names.
            ``None`` or an empty mapping is treated as "no constructs".
        max_distance: Threshold forwarded to ``is_similar``.

    Returns:
        A ``(TP, FP, FN)`` tuple where TP is the number of detected names
        matching at least one true name, FP the number of detected names
        matching none, and FN the number of true names that no detected
        name matches.
    """
    true_set = set((true_constructs or {}).values())
    detected_set = set((detected_constructs or {}).values())

    TP = sum(
        1 for det in detected_set
        if any(is_similar(det, tru, max_distance) for tru in true_set)
    )
    FP = len(detected_set) - TP

    # Count misses from the ground-truth side. Deriving FN as
    # len(true_set) - TP goes negative as soon as several detected names
    # fuzzy-match the same true name.
    FN = sum(
        1 for tru in true_set
        if not any(is_similar(det, tru, max_distance) for det in detected_set)
    )
    return TP, FP, FN



In [13]:
def compare_hypotheses(true_constructs, detected_constructs, true_hypotheses, detected_hypotheses):
    """Compare ground-truth and detected hypotheses as (cause, effect) name pairs.

    Each hypothesis refers to its cause and effect by construct key. Keys are
    translated to construct names through the matching constructs mapping, and
    the resulting name pairs are compared by exact equality.

    Args:
        true_constructs: Mapping from ground-truth construct key to name.
        detected_constructs: Mapping from detected construct key to name.
        true_hypotheses: Mapping whose values are dicts with ``'cause'`` and
            ``'effect'`` keys and optionally ``'label'``.
        detected_hypotheses: Same structure for the detected data. Entries
            whose cause or effect key is not in ``detected_constructs`` are
            ignored.

    Returns:
        A ``(TP, FP, FN, correct_labels_count)`` tuple. TP/FP/FN are counted
        over distinct name pairs. ``correct_labels_count`` is the number of
        true hypotheses whose pair was detected and whose label is within an
        edit distance of 2 of the detected label.

    Raises:
        KeyError: If a true hypothesis lacks ``'cause'``/``'effect'`` or
            refers to a key that is not in ``true_constructs``.
    """
    # Translate hypothesis keys to construct names for true data
    true_pairs = [
        (true_constructs[h['cause']], true_constructs[h['effect']])
        for h in true_hypotheses.values()
    ]
    true_hypotheses_translated = set(true_pairs)

    # Translate hypothesis keys to construct names for detected data and
    # remember the first detected hypothesis seen for each name pair.
    detected_by_pair = {}
    for h in detected_hypotheses.values():
        cause = h.get('cause')
        effect = h.get('effect')
        if cause in detected_constructs and effect in detected_constructs:
            pair = (detected_constructs[cause], detected_constructs[effect])
            detected_by_pair.setdefault(pair, h)
    detected_hypotheses_translated = set(detected_by_pair)

    TP = len(true_hypotheses_translated & detected_hypotheses_translated)
    FP = len(detected_hypotheses_translated - true_hypotheses_translated)
    FN = len(true_hypotheses_translated - detected_hypotheses_translated)

    # Evaluate label correctness for TP hypotheses. The detected counterpart
    # is looked up by translated name pair, the same criterion used for TP.
    # Comparing raw construct keys would miss matches whenever the two files
    # number their constructs differently.
    correct_labels_count = 0
    for h, pair in zip(true_hypotheses.values(), true_pairs):
        detected_hypothesis = detected_by_pair.get(pair)
        if detected_hypothesis is None:
            continue
        true_label = h.get('label')
        detected_label = detected_hypothesis.get('label')
        # A missing or empty label on either side cannot be a correct label.
        if true_label and detected_label and is_similar(true_label, detected_label, max_distance=2):
            correct_labels_count += 1

    return TP, FP, FN, correct_labels_count

In [8]:
def is_similar(str1, str2, max_distance=3):
    """Tell whether two strings are within ``max_distance`` of each other.

    The distance is the value ``lev.distance`` reports for the pair; the
    threshold is inclusive.
    """
    edit_distance = lev.distance(str1, str2)
    return edit_distance <= max_distance

In [9]:
def calculate_metrics(TP, FP, FN):
    """Derive precision, recall and F1 from true/false positive and negative counts.

    Each metric falls back to ``0`` when its denominator is not positive.

    Returns:
        A ``(precision, recall, f1)`` tuple.
    """
    predicted_total = TP + FP
    actual_total = TP + FN

    precision = 0
    if predicted_total > 0:
        precision = TP / predicted_total

    recall = 0
    if actual_total > 0:
        recall = TP / actual_total

    f1 = 0
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1

In [17]:
def calculate_label_metrics(correct_hypotheses_with_labels):
    """Score extracted hypothesis labels against ground-truth labels.

    Args:
        correct_hypotheses_with_labels: Iterable of
            ``(gt_label, ex_label)`` pairs. A falsy label (``None`` or an
            empty string) means the label is absent on that side.

    Returns:
        The ``(precision, recall, f1)`` tuple from ``calculate_metrics``.

    Each pair falls into exactly one bucket:
      * both labels present and within edit distance 2 -> TP
      * both labels present but too different          -> FP
      * only the extracted label present               -> FP
      * only the ground-truth label present            -> FN
      * neither label present                          -> ignored
    """
    TP = FP = FN = 0
    for gt_label, ex_label in correct_hypotheses_with_labels:
        if gt_label and ex_label:
            if is_similar(gt_label, ex_label, max_distance=2):
                TP += 1
            else:
                FP += 1
        elif ex_label:
            # A label was extracted although the ground truth has none.
            FP += 1
        elif gt_label:
            # Only a miss. Comparing against the absent label would either
            # fail inside is_similar or count a short ground-truth label as
            # both a hit and a miss.
            FN += 1

    return calculate_metrics(TP, FP, FN)

In [11]:
# Collect the YAML file paths to evaluate.
# Each entry of extracted_files_sets is one extraction run; its position in
# the list becomes the v1, v2, v3, ... suffix in the printed results.
extracted_files_sets = [
    sorted(glob.glob(pattern))
    for pattern in (
        '../chatGPT_short/*.yaml',
        '../chatGPT_long/*.yaml',
        '../chatGPT_YAML_JSON/*.yaml',
    )
]
# Ground-truth files, paired with extracted files by base file name.
ground_truth_files = sorted(glob.glob('../true_results/*.yaml'))

# Function to extract filename without extension
def get_filename_without_extension(file_path):
    """Return the last path component of ``file_path`` with its final extension removed."""
    _directory, file_name = os.path.split(file_path)
    stem, _extension = os.path.splitext(file_name)
    return stem

# Function to find matching ground truth file
def find_matching_gt_file(extracted_file, ground_truth_files):
    """Return the first ground-truth path sharing the extracted file's base name.

    Base names are compared via ``get_filename_without_extension``.
    Returns ``None`` when no ground-truth file matches.
    """
    wanted_stem = get_filename_without_extension(extracted_file)
    matching_paths = (
        candidate
        for candidate in ground_truth_files
        if get_filename_without_extension(candidate) == wanted_stem
    )
    return next(matching_paths, None)

# Function to compare extracted files to the ground truth files
def process_extracted_files(extracted_files, ground_truth_files):
    """Accumulate construct, link and label comparison results over a set of files.

    Every extracted file is paired with the ground-truth file of the same
    base name. Files without a counterpart, and files that do not load as a
    YAML mapping, are reported with ``print`` and skipped.

    Args:
        extracted_files: Paths of the extracted YAML files.
        ground_truth_files: Paths of the ground-truth YAML files.

    Returns:
        An 8-tuple ``(constructs_TP, constructs_FP, constructs_FN,
        hypotheses_TP, hypotheses_FP, hypotheses_FN, correct_labels_count,
        correct_hypotheses_with_labels)``. The last element is a list of
        ``(gt_label, ex_label)`` pairs for hypothesis ids present in both
        files.
    """
    constructs_TP = constructs_FP = constructs_FN = 0
    hypotheses_TP = hypotheses_FP = hypotheses_FN = 0
    correct_labels_count = 0
    correct_hypotheses_with_labels = []

    for ex_file in extracted_files:
        gt_file = find_matching_gt_file(ex_file, ground_truth_files)
        if not gt_file:
            print(f"No matching ground truth file for {ex_file}")
            continue

        ground_truth = load_yaml(gt_file)
        extracted_data = load_yaml(ex_file)

        # load_yaml gives None for an empty or unparsable file; the lookups
        # below need mappings.
        if not isinstance(ground_truth, dict) or not isinstance(extracted_data, dict):
            print(f"Skipping {ex_file}: it or {gt_file} did not load as a YAML mapping")
            continue

        # Rebind for every file so that a file without constructs can never
        # reuse the mappings left over from the previous file.
        true_constructs = ground_truth.get('constructs')
        detected_constructs = extracted_data.get('constructs')
        have_constructs = true_constructs is not None and detected_constructs is not None

        # Check for constructs in ground truth and extracted data
        if have_constructs:
            TP, FP, FN = compare_constructs(true_constructs, detected_constructs)
            constructs_TP += TP
            constructs_FP += FP
            constructs_FN += FN

        # Check for hypotheses in ground truth and extracted data
        true_hypotheses = ground_truth.get('hypotheses')
        detected_hypotheses = extracted_data.get('hypotheses')
        if true_hypotheses is not None and detected_hypotheses is not None:
            if have_constructs:
                TP, FP, FN, labels_count = compare_hypotheses(true_constructs, detected_constructs, true_hypotheses, detected_hypotheses)
                hypotheses_TP += TP
                hypotheses_FP += FP
                hypotheses_FN += FN
                correct_labels_count += labels_count
            else:
                # Links are compared by construct name, which needs both mappings.
                print(f"Skipping link comparison for {ex_file}: constructs missing")

            # NOTE(review): labels are paired by hypothesis id only, whether or
            # not the cause/effect of the two hypotheses agree - confirm that
            # this is the intended pairing.
            for hypothesis_id in true_hypotheses:
                if hypothesis_id in detected_hypotheses:
                    gt_label = true_hypotheses[hypothesis_id]['label']
                    ex_label = detected_hypotheses[hypothesis_id]['label']
                    correct_hypotheses_with_labels.append((gt_label, ex_label))

    return constructs_TP, constructs_FP, constructs_FN, hypotheses_TP, hypotheses_FP, hypotheses_FN, correct_labels_count, correct_hypotheses_with_labels


In [21]:
# Score every extraction run against the ground truth and print one line per
# category (constructs, links, labels); runs are numbered from 1.
for idx, extracted_files in enumerate(extracted_files_sets):
    (
        constructs_TP, constructs_FP, constructs_FN,
        hypotheses_TP, hypotheses_FP, hypotheses_FN,
        correct_labels_count,
        correct_hypotheses_with_labels,
    ) = process_extracted_files(extracted_files, ground_truth_files)

    # Construct names
    constructs_precision, constructs_recall, constructs_f1 = calculate_metrics(
        constructs_TP, constructs_FP, constructs_FN
    )
    print(f"Constructs v{idx+1} - Precision: {constructs_precision:.2f}, Recall: {constructs_recall:.2f}, F1 Score: {constructs_f1:.2f}")

    # Cause/effect links between constructs
    hypotheses_precision, hypotheses_recall, hypotheses_f1 = calculate_metrics(
        hypotheses_TP, hypotheses_FP, hypotheses_FN
    )
    print(f"Links      v{idx+1} - Precision: {hypotheses_precision:.2f}, Recall: {hypotheses_recall:.2f}, F1 Score: {hypotheses_f1:.2f}")

    # Hypothesis labels; the trailing newline separates the runs
    label_precision, label_recall, label_f1 = calculate_label_metrics(
        correct_hypotheses_with_labels
    )
    print(f"Labels     v{idx+1} - Precision: {label_precision:.2f}, Recall: {label_recall:.2f}, F1 Score: {label_f1:.2f}\n")

# Fixed reference figures: hard-coded here, not computed by this notebook.
print("\nConstructs gt - Precision: 0.88, Recall: 0.80, F1 Score: 0.82   (Mammoth pipeline)")
print("Labels     gt - Accuracy: 0.72")

Constructs v1 - Precision: 0.94, Recall: 0.90, F1 Score: 0.92
Links      v1 - Precision: 0.56, Recall: 0.49, F1 Score: 0.52
Labels     v1 - Precision: 0.83, Recall: 0.96, F1 Score: 0.89

Constructs v2 - Precision: 0.91, Recall: 0.87, F1 Score: 0.89
Links      v2 - Precision: 0.55, Recall: 0.51, F1 Score: 0.53
Labels     v2 - Precision: 0.72, Recall: 1.00, F1 Score: 0.84

Constructs v3 - Precision: 0.96, Recall: 0.98, F1 Score: 0.97
Links      v3 - Precision: 0.72, Recall: 0.65, F1 Score: 0.68
Labels     v3 - Precision: 0.73, Recall: 0.96, F1 Score: 0.83


Constructs gt - Precision: 0.88, Recall: 0.80, F1 Score: 0.82   (Mammoth pipeline)
Labels     gt - Accuracy: 0.72
