In [1]:
import os
import yaml
import glob
import Levenshtein as lev
from sklearn.metrics import precision_score, recall_score, f1_score

c:\Python311\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [2]:
def load_yaml(file_path):
    """Parse a YAML file and return its content.

    Parameters
    ----------
    file_path : str
        Path of the YAML file to read.

    Returns
    -------
    object or None
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the file is not valid YAML (the parser error is printed in that case).
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252 on Windows) corrupts or rejects non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            # Callers test for None to detect a file that failed to parse.
            return None

In [3]:
def compare_items(true_items, detected_items):
    """Count exact-match true positives, false positives and false negatives.

    Every element of ``detected_items`` that is contained in ``true_items``
    counts as a true positive, every other detected element as a false
    positive. Every element of ``true_items`` that is not contained in
    ``detected_items`` counts as a false negative.

    Returns the tuple ``(TP, FP, FN)``.
    """
    # One membership result per detected element, in iteration order.
    membership = [candidate in true_items for candidate in detected_items]
    hits = sum(membership)
    false_alarms = len(membership) - hits

    misses = sum(1 for expected in true_items if expected not in detected_items)

    return hits, false_alarms, misses

In [4]:
def compare_constructs(true_constructs, detected_constructs, max_distance=3):
    """Fuzzy-compare detected construct names against the true ones.

    Only the mapping *values* (the construct names) are compared; duplicates
    are collapsed because both sides are turned into sets. Two names match
    when ``is_similar`` accepts them for the given ``max_distance``.

    Parameters
    ----------
    true_constructs, detected_constructs : dict or None
        Mappings from construct key to construct name. ``None`` (for example
        an empty YAML section) is treated as an empty mapping.
    max_distance : int, optional
        Tolerance forwarded to ``is_similar``.

    Returns
    -------
    tuple
        ``(TP, FP, FN)`` where TP is the number of detected names matching at
        least one true name, FP the number of detected names matching none,
        and FN the number of true names matched by no detected name.
    """
    true_set = set((true_constructs or {}).values())
    detected_set = set((detected_constructs or {}).values())

    TP = sum(
        1 for det in detected_set
        if any(is_similar(det, tru, max_distance) for tru in true_set)
    )
    FP = len(detected_set) - TP
    # Count misses from the true side. Deriving FN as len(true_set) - TP goes
    # negative when several detected names match the same true name.
    FN = sum(
        1 for tru in true_set
        if not any(is_similar(det, tru, max_distance) for det in detected_set)
    )
    return TP, FP, FN

# TODO: consider stripping special characters from the construct names before comparing them.

In [5]:
def compare_hypotheses(true_constructs, detected_constructs, true_hypotheses, detected_hypotheses):
    """Compare hypotheses as exact (cause name, effect name) pairs.

    Each hypothesis is a mapping with a 'cause' and an 'effect' entry that
    refer to keys of the corresponding constructs mapping. The keys are
    translated to construct names so that the two sides can be compared even
    when they use different key schemes. Matching is exact set membership.

    Parameters
    ----------
    true_constructs, detected_constructs : dict or None
        Mappings from construct key to construct name.
    true_hypotheses, detected_hypotheses : dict or None
        Mappings from hypothesis key to a ``{'cause': ..., 'effect': ...}``
        mapping. ``None`` is treated as "no hypotheses".

    Returns
    -------
    tuple
        ``(TP, FP, FN)`` computed on the sets of translated pairs.

    Raises
    ------
    KeyError
        If a hypothesis has no 'cause' or 'effect' entry.
    """
    def _translate(hypotheses, constructs):
        # Resolve construct keys to names. A reference to an unknown construct
        # key is kept as-is rather than raising KeyError, so the hypothesis
        # still takes part in the comparison.
        constructs = constructs or {}
        pairs = set()
        for hyp in (hypotheses or {}).values():
            cause, effect = hyp['cause'], hyp['effect']
            pairs.add((constructs.get(cause, cause), constructs.get(effect, effect)))
        return pairs

    true_pairs = _translate(true_hypotheses, true_constructs)
    detected_pairs = _translate(detected_hypotheses, detected_constructs)

    TP = len(true_pairs & detected_pairs)
    FP = len(detected_pairs - true_pairs)
    FN = len(true_pairs - detected_pairs)

    return TP, FP, FN

In [6]:
"""
def compare_texts(true_texts, detected_texts, max_distance=3):
    TP = sum(1 for det_text in detected_texts if any(is_similar(det_text, true_text, max_distance) for true_text in true_texts))
    FP = len(detected_texts) - TP
    FN = len(true_texts) - TP
    return TP, FP, FN
"""

'\ndef compare_texts(true_texts, detected_texts, max_distance=3):\n    TP = sum(1 for det_text in detected_texts if any(is_similar(det_text, true_text, max_distance) for true_text in true_texts))\n    FP = len(detected_texts) - TP\n    FN = len(true_texts) - TP\n    return TP, FP, FN\n'

In [7]:
def is_similar(str1, str2, max_distance=3):
    """Return True when ``lev.distance(str1, str2)`` is at most ``max_distance``."""
    edit_distance = lev.distance(str1, str2)
    return max_distance >= edit_distance

In [8]:
def calculate_metrics(TP, FP, FN):
    """Turn TP/FP/FN counts into ``(precision, recall, f1)``.

    Any ratio whose denominator is not positive is reported as 0 instead of
    raising ``ZeroDivisionError``.
    """
    predicted_positives = TP + FP
    actual_positives = TP + FN

    precision = 0
    if predicted_positives > 0:
        precision = TP / predicted_positives

    recall = 0
    if actual_positives > 0:
        recall = TP / actual_positives

    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

In [9]:
# Collect the YAML file paths of both extraction runs and of the ground truth,
# each list in sorted (lexicographic) order.
extracted_files = glob.glob('chatGPT_long/*.yaml')
extracted_files.sort()

extracted_files_v2 = glob.glob('chatGPT_short/*.yaml')
extracted_files_v2.sort()

ground_truth_files = glob.glob('true_results/*.yaml')
ground_truth_files.sort()


In [10]:
# Replace every backslash in the collected paths with a forward slash.
# The right-hand side is fully evaluated before the three names are rebound.
ground_truth_files, extracted_files, extracted_files_v2 = (
    [path.replace("\\", "/") for path in file_list]
    for file_list in (ground_truth_files, extracted_files, extracted_files_v2)
)

# Last expression of the cell: display the normalised list.
extracted_files

['chatGPT_long/diagram1.yaml',
 'chatGPT_long/diagram2.yaml',
 'chatGPT_long/diagram28.yaml',
 'chatGPT_long/diagram3.yaml',
 'chatGPT_long/diagram30.yaml',
 'chatGPT_long/diagram4.yaml',
 'chatGPT_long/diagram5.yaml',
 'chatGPT_long/diagram50.yaml',
 'chatGPT_long/diagram6.yaml',
 'chatGPT_long/diagram7.yaml',
 'chatGPT_long/diagram8.yaml',
 'chatGPT_long/diagram87.yaml',
 'chatGPT_long/diagram99.yaml']

In [11]:
"""
ground_truth = []
extracted_data = []
extracted_data_v2 = []


# Load files
for gt_file, ex_file in zip(ground_truth_files, extracted_files):
    ground_truth.append(load_yaml(gt_file))
    extracted_data.append(load_yaml(ex_file))

"""

'\nground_truth = []\nextracted_data = []\nextracted_data_v2 = []\n\n\n# Load files\nfor gt_file, ex_file in zip(ground_truth_files, extracted_files):\n    ground_truth.append(load_yaml(gt_file))\n    extracted_data.append(load_yaml(ex_file))\n\n'

In [12]:
"""
# Initialize counters for constructs and hypotheses
constructs_TP = constructs_FP = constructs_FN = 0
hypotheses_TP = hypotheses_FP = hypotheses_FN = 0

# Process each file
for gt_file, ex_file in zip(ground_truth_files, extracted_files):
    ground_truth = load_yaml(gt_file)
    extracted_data = load_yaml(ex_file)

    # If either file failed to load properly, skip this pair
    if ground_truth is None or extracted_data is None:
        print(f"Error loading files: {gt_file}, {ex_file}")
        continue
    if 'constructs' not in ground_truth or 'constructs' not in extracted_data:
        print(f"Missing 'constructs' in files: {gt_file}, {ex_file}")
        continue
    
    # Compare constructs
    true_constructs = ground_truth.get('constructs', {})
    detected_constructs = extracted_data.get('constructs', {})
    TP, FP, FN = compare_constructs(true_constructs, detected_constructs)
    constructs_TP += TP
    constructs_FP += FP
    constructs_FN += FN

    # Compare hypotheses using the updated function
    true_hypotheses = ground_truth.get('hypotheses', {})
    detected_hypotheses = extracted_data.get('hypotheses', {})
    TP, FP, FN = compare_hypotheses(true_constructs, detected_constructs, true_hypotheses, detected_hypotheses)
    hypotheses_TP += TP
    hypotheses_FP += FP
    hypotheses_FN += FN

    # Compare texts
    #TP, FP, FN = compare_texts(ground_truth['texts'], extracted_data['texts'])
    #all_TP += TP
    #all_FP += FP
    #all_FN += FN
"""

'\n# Initialize counters for constructs and hypotheses\nconstructs_TP = constructs_FP = constructs_FN = 0\nhypotheses_TP = hypotheses_FP = hypotheses_FN = 0\n\n# Process each file\nfor gt_file, ex_file in zip(ground_truth_files, extracted_files):\n    ground_truth = load_yaml(gt_file)\n    extracted_data = load_yaml(ex_file)\n\n    # If either file failed to load properly, skip this pair\n    if ground_truth is None or extracted_data is None:\n        print(f"Error loading files: {gt_file}, {ex_file}")\n        continue\n    if \'constructs\' not in ground_truth or \'constructs\' not in extracted_data:\n        print(f"Missing \'constructs\' in files: {gt_file}, {ex_file}")\n        continue\n    \n    # Compare constructs\n    true_constructs = ground_truth.get(\'constructs\', {})\n    detected_constructs = extracted_data.get(\'constructs\', {})\n    TP, FP, FN = compare_constructs(true_constructs, detected_constructs)\n    constructs_TP += TP\n    constructs_FP += FP\n    constructs_

In [13]:
# Accumulate TP/FP/FN counts for constructs and hypotheses, separately for the
# two extraction runs (v1 = extracted_files, v2 = extracted_files_v2).

def _score_extraction(ground_truth, extracted_data):
    """Compare one extraction against its ground truth.

    Returns ``(construct_counts, hypothesis_counts)``, each a ``(TP, FP, FN)``
    tuple. A missing or empty 'constructs' / 'hypotheses' entry is treated as
    an empty mapping.
    """
    true_constructs = ground_truth.get('constructs') or {}
    detected_constructs = extracted_data.get('constructs') or {}
    construct_counts = compare_constructs(true_constructs, detected_constructs)
    hypothesis_counts = compare_hypotheses(
        true_constructs,
        detected_constructs,
        ground_truth.get('hypotheses') or {},
        extracted_data.get('hypotheses') or {},
    )
    return construct_counts, hypothesis_counts


# Running [TP, FP, FN] totals per extraction version and per element kind.
totals = {
    version: {'constructs': [0, 0, 0], 'hypotheses': [0, 0, 0]}
    for version in ('v1', 'v2')
}

# zip() pairs the lists by position and stops at the shortest one, so make a
# length mismatch visible instead of silently dropping files.
if not (len(ground_truth_files) == len(extracted_files) == len(extracted_files_v2)):
    print("Warning: file lists differ in length; unpaired trailing files are ignored.")

for gt_file, ex_file_v1, ex_file_v2 in zip(ground_truth_files, extracted_files, extracted_files_v2):
    ground_truth = load_yaml(gt_file)

    # Without usable ground truth neither version can be scored for this file.
    if ground_truth is None:
        print(f"Error loading ground truth file: {gt_file}")
        continue
    if not isinstance(ground_truth, dict) or 'constructs' not in ground_truth:
        print(f"Missing 'constructs' in file: {gt_file}")
        continue

    # Score each version independently against the ground truth of THIS file,
    # so a problem in one version never leaks stale data into the other.
    for version, ex_file in (('v1', ex_file_v1), ('v2', ex_file_v2)):
        extracted_data = load_yaml(ex_file)
        # load_yaml returns None for unparsable files; guard before using `in`.
        if not isinstance(extracted_data, dict) or 'constructs' not in extracted_data:
            print(f"Missing 'constructs' in file: {ex_file}")
            continue

        construct_counts, hypothesis_counts = _score_extraction(ground_truth, extracted_data)
        for kind, counts in (('constructs', construct_counts), ('hypotheses', hypothesis_counts)):
            totals[version][kind] = [
                running + new for running, new in zip(totals[version][kind], counts)
            ]

# Expose the totals under the names used by the reporting cell.
constructs_TP_v1, constructs_FP_v1, constructs_FN_v1 = totals['v1']['constructs']
hypotheses_TP_v1, hypotheses_FP_v1, hypotheses_FN_v1 = totals['v1']['hypotheses']

constructs_TP_v2, constructs_FP_v2, constructs_FN_v2 = totals['v2']['constructs']
hypotheses_TP_v2, hypotheses_FP_v2, hypotheses_FN_v2 = totals['v2']['hypotheses']


In [14]:
# Report precision / recall / F1 for both extraction runs.
# v1 = long-prompt run (extracted_files), v2 = short-prompt run (extracted_files_v2).

# Constructs, v1
constructs_precision_v1, constructs_recall_v1, constructs_f1_v1 = calculate_metrics(constructs_TP_v1, constructs_FP_v1, constructs_FN_v1)
print(f"Constructs v1 - Precision: {constructs_precision_v1:.2f}, Recall: {constructs_recall_v1:.2f}, F1 Score: {constructs_f1_v1:.2f} (ChatGPT long prompt)")

# Constructs, v2
constructs_precision_v2, constructs_recall_v2, constructs_f1_v2 = calculate_metrics(constructs_TP_v2, constructs_FP_v2, constructs_FN_v2)
print(f"Constructs v2 - Precision: {constructs_precision_v2:.2f}, Recall: {constructs_recall_v2:.2f}, F1 Score: {constructs_f1_v2:.2f} (ChatGPT short prompt)")

# Reference line with hard-coded numbers; it is not computed in this notebook.
print("Constructs    - Precision: 0.88, Recall: 0.80, F1 Score: 0.82 (Mammoth pipeline)")

print("\n")

# Hypotheses, v1
hypotheses_precision_v1, hypotheses_recall_v1, hypotheses_f1_v1 = calculate_metrics(hypotheses_TP_v1, hypotheses_FP_v1, hypotheses_FN_v1)
print(f"Hypotheses v1 - Precision: {hypotheses_precision_v1:.2f}, Recall: {hypotheses_recall_v1:.2f}, F1 Score: {hypotheses_f1_v1:.2f} (ChatGPT long prompt)")

# Hypotheses, v2 -- this is the short-prompt run, so label it accordingly
# (the label previously said "long prompt").
hypotheses_precision_v2, hypotheses_recall_v2, hypotheses_f1_v2 = calculate_metrics(hypotheses_TP_v2, hypotheses_FP_v2, hypotheses_FN_v2)
print(f"Hypotheses v2 - Precision: {hypotheses_precision_v2:.2f}, Recall: {hypotheses_recall_v2:.2f}, F1 Score: {hypotheses_f1_v2:.2f} (ChatGPT short prompt)")



Constructs v1 - Precision: 0.90, Recall: 0.85, F1 Score: 0.87 (ChatGPT long prompt)
Constructs v2 - Precision: 1.00, Recall: 0.97, F1 Score: 0.98 (ChatGPT short prompt)
Constructs    - Precision: 0.88, Recall: 0.80, F1 Score: 0.82 (Mammoth pipeline)


Hypotheses v1 - Precision: 0.51, Recall: 0.47, F1 Score: 0.49 (ChatGPT long prompt)
Hypotheses v2 - Precision: 0.60, Recall: 0.50, F1 Score: 0.54 (ChatGPT long prompt)


In [15]:
"""
# Calculate and print metrics for constructs
constructs_precision, constructs_recall, constructs_f1 = calculate_metrics(constructs_TP, constructs_FP, constructs_FN)
print(f"Constructs - Precision: {constructs_precision:.2f}, Recall: {constructs_recall:.2f}, F1 Score: {constructs_f1:.2f} (ChatGPT long prompt)")

print("Constructs - Precision: 0.88, Recall: 0.80, F1 Score: 0.82 (Mammoth pipeline)")

# Calculate and print metrics for hypotheses
#hypotheses_precision, hypotheses_recall, hypotheses_f1 = calculate_metrics(hypotheses_TP, hypotheses_FP, hypotheses_FN)
#print(f"Hypotheses - Precision: {hypotheses_precision:.2f}, Recall: {hypotheses_recall:.2f}, F1 Score: {hypotheses_f1:.2f} (ChatGPT long prompt)")
"""

'\n# Calculate and print metrics for constructs\nconstructs_precision, constructs_recall, constructs_f1 = calculate_metrics(constructs_TP, constructs_FP, constructs_FN)\nprint(f"Constructs - Precision: {constructs_precision:.2f}, Recall: {constructs_recall:.2f}, F1 Score: {constructs_f1:.2f} (ChatGPT long prompt)")\n\nprint("Constructs - Precision: 0.88, Recall: 0.80, F1 Score: 0.82 (Mammoth pipeline)")\n\n# Calculate and print metrics for hypotheses\n#hypotheses_precision, hypotheses_recall, hypotheses_f1 = calculate_metrics(hypotheses_TP, hypotheses_FP, hypotheses_FN)\n#print(f"Hypotheses - Precision: {hypotheses_precision:.2f}, Recall: {hypotheses_recall:.2f}, F1 Score: {hypotheses_f1:.2f} (ChatGPT long prompt)")\n'