In [1]:
import os
import yaml
import glob
import Levenshtein as lev
import pandas as pd
import plotly.express as px
from sklearn.metrics import precision_score, recall_score, f1_score


c:\Python311\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [2]:
def load_yaml(file_path):
    with open(file_path, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

In [3]:
def compare_constructs(true_constructs, detected_constructs, max_distance=0):
    true_set = set(true_constructs.values())
    detected_set = set(detected_constructs.values())
    TP = sum(1 for det in detected_set if any(is_similar(det, tru, max_distance) for tru in true_set))
    FP = len(detected_set) - TP
    FN = len(true_set) - TP
    return TP, FP, FN

In [4]:
def compare_hypotheses(true_constructs, detected_constructs, true_hypotheses, detected_hypotheses):
    # Translate hypothesis keys to construct names for true data
    true_hypotheses_translated = {(true_constructs[h['cause']], true_constructs[h['effect']]) for h in true_hypotheses.values()}

    # Translate hypothesis keys to construct names for detected data
    detected_hypotheses_translated = set()
    for h in detected_hypotheses.values():
        cause = h.get('cause')
        effect = h.get('effect')
        if cause in detected_constructs and effect in detected_constructs:
            detected_hypotheses_translated.add((detected_constructs[cause], detected_constructs[effect]))

    TP = len(true_hypotheses_translated.intersection(detected_hypotheses_translated))
    FP = len(detected_hypotheses_translated - true_hypotheses_translated)
    FN = len(true_hypotheses_translated - detected_hypotheses_translated)

    # Evaluate label correctness for TP hypotheses
    correct_labels_count = 0
    for h in true_hypotheses.values():
        if (true_constructs[h['cause']], true_constructs[h['effect']]) in detected_hypotheses_translated:
            detected_hypothesis = next((dh for dh in detected_hypotheses.values() if dh['cause'] == h['cause'] and dh['effect'] == h['effect']), None)
            if detected_hypothesis and is_similar(h['label'], detected_hypothesis['label'], max_distance=1):
                correct_labels_count += 1

    return TP, FP, FN, correct_labels_count

In [5]:
def is_similar(str1, str2, max_distance=1):
    # Check if detected is a substring of true
    if str1 in str2:
        return True

    # If not a substring, check Levenshtein distance
    return lev.distance(str1, str2) <= max_distance

In [6]:
def calculate_metrics(TP, FP, FN):
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1, TP, FP, FN

In [7]:
# gt_label == ground truth label
# ex_label == extracted label
def calculate_label_metrics(correct_hypotheses_with_labels):
    TP = sum(is_similar(gt_label, ex_label, max_distance=1) for gt_label, ex_label in correct_hypotheses_with_labels if gt_label)
    
    FP_mismatched_label = sum(not is_similar(gt_label, ex_label, max_distance=1) for gt_label, ex_label in correct_hypotheses_with_labels if gt_label and ex_label)
    FP_no_ground_truth_label = sum(1 for gt_label, ex_label in correct_hypotheses_with_labels if not gt_label and ex_label)
    FP = FP_mismatched_label + FP_no_ground_truth_label
    
    FN = sum(1 for gt_label, ex_label in correct_hypotheses_with_labels if gt_label and not ex_label)

    return calculate_metrics(TP, FP, FN)

In [8]:
# Load the YAML files sets
"""in the output: V1, V2, V3, ..."""
extracted_files_sets = [ 
    sorted(glob.glob('../chatGPT_short/*.yaml')),
    sorted(glob.glob('../chatGPT_long/*.yaml')),
    sorted(glob.glob('../chatGPT_YAML_JSON/*.yaml')),
    sorted(glob.glob('../chatGPT_fewshot5_yaml/*.yaml')),
    sorted(glob.glob('../chatGPT_fewshot5_YAML_JSON/*.yaml'))
]
ground_truth_files = sorted(glob.glob('../true_results/*.yaml'))

# Function to extract filename without extension
def get_filename_without_extension(file_path):
    return os.path.splitext(os.path.basename(file_path))[0]

# Function to find matching ground truth file
def find_matching_gt_file(extracted_file, ground_truth_files):
    extracted_filename = get_filename_without_extension(extracted_file)
    for gt_file in ground_truth_files:
        if get_filename_without_extension(gt_file) == extracted_filename:
            return gt_file
    return None

# Function compare extracted files to the ground truth files
def process_extracted_files(extracted_files, ground_truth_files):
    constructs_TP = constructs_FP = constructs_FN = 0
    hypotheses_TP = hypotheses_FP = hypotheses_FN = 0
    correct_labels_count = 0
    correct_hypotheses_with_labels = []

    for ex_file in extracted_files:
        gt_file = find_matching_gt_file(ex_file, ground_truth_files)
        if not gt_file:
            print(f"No matching ground truth file for {ex_file}")
            continue

        ground_truth = load_yaml(gt_file)
        extracted_data = load_yaml(ex_file)

        # Check for constructs in ground truth and extracted data
        if 'constructs' in ground_truth and 'constructs' in extracted_data:
            true_constructs = ground_truth['constructs']
            detected_constructs = extracted_data['constructs']
            TP, FP, FN = compare_constructs(true_constructs, detected_constructs)
            constructs_TP += TP
            constructs_FP += FP
            constructs_FN += FN

        # Check for hypotheses in ground truth and extracted data
        if 'hypotheses' in ground_truth and 'hypotheses' in extracted_data:
            true_hypotheses = ground_truth['hypotheses']
            detected_hypotheses = extracted_data['hypotheses']
            TP, FP, FN, labels_count = compare_hypotheses(true_constructs, detected_constructs, true_hypotheses, detected_hypotheses)
            hypotheses_TP += TP
            hypotheses_FP += FP
            hypotheses_FN += FN
            correct_labels_count += labels_count

            for hypothesis_id in true_hypotheses:
                if hypothesis_id in detected_hypotheses:
                    gt_label = true_hypotheses[hypothesis_id]['label']
                    ex_label = detected_hypotheses[hypothesis_id]['label']
                    correct_hypotheses_with_labels.append((gt_label, ex_label))

    return constructs_TP, constructs_FP, constructs_FN, hypotheses_TP, hypotheses_FP, hypotheses_FN, correct_labels_count, correct_hypotheses_with_labels


In [11]:
# Initialize lists to store metrics for visualization
constructs_metrics = []
links_metrics = []
labels_metrics = []

# Process each set of extracted files
for idx, extracted_files in enumerate(extracted_files_sets):
    constructs_TP, constructs_FP, constructs_FN, hypotheses_TP, hypotheses_FP, hypotheses_FN, correct_labels_count, correct_hypotheses_with_labels = process_extracted_files(extracted_files, ground_truth_files)
    
    # Calculate and print metrics for constructs
    constructs_precision, constructs_recall, constructs_f1, new_TP, new_FP, new_FN = calculate_metrics(constructs_TP, constructs_FP, constructs_FN)
    print(f"Constructs v{idx+1} - Precision: {constructs_precision:.2f}, Recall: {constructs_recall:.2f}, F1 Score: {constructs_f1:.2f}")

    # Calculate and print metrics for hypotheses
    hypotheses_precision, hypotheses_recall, hypotheses_f1, tmp_TP, tmp_FP, tmp_FN = calculate_metrics(hypotheses_TP, hypotheses_FP, hypotheses_FN)
    print(f"Links      v{idx+1} - Precision: {hypotheses_precision:.2f}, Recall: {hypotheses_recall:.2f}, F1 Score: {hypotheses_f1:.2f}")

    # Calculate and print metrics for label accuracy
    label_precision, label_recall, label_f1, label_TP, label_FP, label_FN= calculate_label_metrics(correct_hypotheses_with_labels)
    print(f"Labels     v{idx+1} - Precision: {label_precision:.2f}, Recall: {label_recall:.2f}, F1 Score: {label_f1:.2f}")

    print(f"construct_TP:  {constructs_TP}, constructs_FP: {constructs_FP}, constructs_FN: {constructs_FN}")
    print(f"hypotheses_TP: {hypotheses_TP}, hypotheses_FP: {hypotheses_FP}, hypotheses_FN: {hypotheses_FN}")
    print(f"label_TP:      {label_TP}, label_FP:      {label_FP}, label_FN:      {label_FN}\n")

    # Append constructs metrics
    constructs_metrics.append({'Version': f'{idx+1}', 'Metric': 'Precision', 'Value': constructs_precision, 'Category': 'Constructs'})
    constructs_metrics.append({'Version': f'{idx+1}', 'Metric': 'Recall', 'Value': constructs_recall, 'Category': 'Constructs'})
    constructs_metrics.append({'Version': f'{idx+1}', 'Metric': 'F1 Score', 'Value': constructs_f1, 'Category': 'Constructs'})

    # Append links metrics
    links_metrics.append({'Version': f'{idx+1}', 'Metric': 'Precision', 'Value': hypotheses_precision, 'Category': 'Links'})
    links_metrics.append({'Version': f'{idx+1}', 'Metric': 'Recall', 'Value': hypotheses_recall, 'Category': 'Links'})
    links_metrics.append({'Version': f'{idx+1}', 'Metric': 'F1 Score', 'Value': hypotheses_f1, 'Category': 'Links'})

    # Append labels metrics
    labels_metrics.append({'Version': f'{idx+1}', 'Metric': 'Precision', 'Value': label_precision, 'Category': 'Labels'})
    labels_metrics.append({'Version': f'{idx+1}', 'Metric': 'Recall', 'Value': label_recall, 'Category': 'Labels'})
    labels_metrics.append({'Version': f'{idx+1}', 'Metric': 'F1 Score', 'Value': label_f1, 'Category': 'Labels'})

# Combine all metrics into a single DataFrame
all_metrics = pd.DataFrame(constructs_metrics + links_metrics + labels_metrics)


print("\nConstructs gt - Precision: 0.88, Recall: 0.80, F1 Score: 0.82   (Mammoth pipeline)")
print("Labels     gt - Accuracy:  0.72")

Constructs v1 - Precision: 0.89, Recall: 0.85, F1 Score: 0.87
Links      v1 - Precision: 0.56, Recall: 0.49, F1 Score: 0.52
Labels     v1 - Precision: 0.74, Recall: 0.96, F1 Score: 0.84
construct_TP:  178, constructs_FP: 22, constructs_FN: 31
hypotheses_TP: 94, hypotheses_FP: 75, hypotheses_FN: 96
label_TP:      96, label_FP:      33, label_FN:      4

Constructs v2 - Precision: 0.85, Recall: 0.81, F1 Score: 0.83
Links      v2 - Precision: 0.55, Recall: 0.51, F1 Score: 0.53
Labels     v2 - Precision: 0.66, Recall: 1.00, F1 Score: 0.80
construct_TP:  180, constructs_FP: 32, constructs_FN: 43
hypotheses_TP: 101, hypotheses_FP: 83, hypotheses_FN: 96
label_TP:      76, label_FP:      39, label_FN:      0

Constructs v3 - Precision: 0.89, Recall: 0.90, F1 Score: 0.90
Links      v3 - Precision: 0.72, Recall: 0.65, F1 Score: 0.68
Labels     v3 - Precision: 0.70, Recall: 0.96, F1 Score: 0.81
construct_TP:  189, constructs_FP: 23, constructs_FN: 20
hypotheses_TP: 132, hypotheses_FP: 52, hypothe

In [18]:
# Create the plot with updated DataFrame
fig = px.bar(all_metrics, x="Version", y="Value", color="Category",
             barmode='group', facet_col="Metric",
             labels={'Metric':''},  # This renames the labels to be empty
             category_orders={"Metric": ["Precision", "Recall", "F1 Score"]},  # This ensures the order of metrics
             title="Metrics Comparison across Datasets")

# Remove 'Metric=' from facet titles
for annotation in fig.layout.annotations:
    annotation.text = annotation.text.split('=')[1]

# Clear titles for all x-axes
for axis in fig.layout:
    if axis.startswith('xaxis'):
        fig.layout[axis].title.text = ''

# Set 'Datasets' label only on the second x-axis
fig.layout['xaxis2'].title.text = 'Datasets'

# Show the figure
fig.show()

In [24]:
# Function to create a grouped bar chart with values displayed for a given category
def create_grouped_bar_chart_with_values(df, category):
    category_df = df[df['Category'] == category]
    fig = px.bar(category_df, x="Version", y="Value", color="Metric",
                 barmode='group',  # This specifies that we want grouped bars, not stacked
                 text="Value",  # Display the value on the bar
                 title=f"{category} Metrics per Dataset",
                 labels={'Metric':''})  # Renames the Metric label to remove 'Metric='
    
    # Update the layout to display the text on the bars
    fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(xaxis_title="Datasets", yaxis_title="Value", uniformtext_minsize=8, uniformtext_mode='hide')
    
    return fig

# Create plots for Constructs, Links, and Labels
constructs_fig = create_grouped_bar_chart_with_values(all_metrics, 'Constructs')
links_fig = create_grouped_bar_chart_with_values(all_metrics, 'Links')
labels_fig = create_grouped_bar_chart_with_values(all_metrics, 'Labels')

# Display the plots
constructs_fig.show()
links_fig.show()
labels_fig.show()

In [27]:
# Function to create a grouped bar chart for a given metric
def create_grouped_bar_chart_by_metric(df, metric):
    metric_df = df[df['Metric'] == metric]
    fig = px.bar(metric_df, x="Version", y="Value", color="Category",
                 barmode='group',
                 text="Value",
                 title=f"{metric} per Dataset",
                 labels={'Category':''})
    
    fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(xaxis_title="Datasets", yaxis_title="Value", uniformtext_minsize=8, uniformtext_mode='hide')
    
    return fig

# Create and show the figures for Precision, Recall, and F1-Score
precision_fig = create_grouped_bar_chart_by_metric(all_metrics, 'Precision')
precision_fig.show()

recall_fig = create_grouped_bar_chart_by_metric(all_metrics, 'Recall')
recall_fig.show()

f1_score_fig = create_grouped_bar_chart_by_metric(all_metrics, 'F1 Score')
f1_score_fig.show()