In [None]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Pipelines under evaluation, as (display name, result CSV path) pairs.
# NOTE(review): all three entries point at the same file, which is also the
# file the evaluation loop reads as ground truth. These look like placeholders
# -- confirm the intended per-pipeline result paths.
_pipeline_sources = (
    ("Pipeline 1", "grouped_result.csv"),
    ("Pipeline 2", "grouped_result.csv"),
    ("Pipeline 3", "grouped_result.csv"),
)

# Parallel lists consumed by the evaluation loop.
pipeline_names = [name for name, _path in _pipeline_sources]
pipeline_files = [path for _name, path in _pipeline_sources]

# Collects one dict of aggregated metrics per evaluated pipeline.
pipeline_metrics = list()

# Evaluation functions
def precision_at_sentence_level(predicted, ground_truth):
    """Return the precision of the predicted labels for one sentence.

    Both arguments are treated as sets of labels, so repeated labels count
    once. This keeps the denominator consistent with the set-based count of
    true positives (a duplicated prediction no longer lowers precision).

    Args:
        predicted: Iterable of hashable predicted labels.
        ground_truth: Iterable of hashable expected labels.

    Returns:
        The fraction of distinct predicted labels that are also in the ground
        truth. When nothing is predicted, returns 1 if the ground truth is
        empty as well and 0 otherwise.
    """
    predicted_labels = set(predicted)
    truth_labels = set(ground_truth)
    if not predicted_labels:
        return 0 if truth_labels else 1
    return len(predicted_labels & truth_labels) / len(predicted_labels)

def recall_at_sentence_level(predicted, ground_truth):
    """Return the recall of the predicted labels for one sentence.

    Both arguments are treated as sets of labels, so repeated labels count
    once. This keeps the denominator consistent with the set-based count of
    true positives (a duplicated ground-truth label no longer lowers recall).

    Args:
        predicted: Iterable of hashable predicted labels.
        ground_truth: Iterable of hashable expected labels.

    Returns:
        The fraction of distinct ground-truth labels that were predicted.
        When the ground truth is empty, returns 1 if nothing was predicted
        either and 0 otherwise.
    """
    predicted_labels = set(predicted)
    truth_labels = set(ground_truth)
    if not truth_labels:
        return 0 if predicted_labels else 1
    return len(predicted_labels & truth_labels) / len(truth_labels)

def f1_score(precision, recall):
    """Combine precision and recall into their harmonic mean (F1).

    Returns 0 when the two inputs sum to zero, which avoids dividing by zero.
    """
    total = precision + recall
    if total != 0:
        return 2 * (precision * recall) / total
    return 0

def sentence_level_accuracy(predictions, ground_truths):
    """Return the share of sentences whose predicted label set is exactly right.

    Predictions and ground truths are paired positionally and compared as
    sets, so label order and duplicates within a sentence are ignored.

    Args:
        predictions: Sequence of label iterables, one per sentence.
        ground_truths: Sized sequence of label iterables, one per sentence.

    Returns:
        Number of exact matches divided by ``len(ground_truths)``. Ground-truth
        entries without a paired prediction therefore count as wrong. Returns
        0.0 when ``ground_truths`` is empty instead of dividing by zero.
    """
    total = len(ground_truths)
    if total == 0:
        return 0.0
    exact_matches = sum(
        set(pred) == set(truth)
        for pred, truth in zip(predictions, ground_truths)
    )
    return exact_matches / total

# Process each pipeline

# The ground truth does not depend on the pipeline, so it is loaded and parsed
# once up front instead of once per loop iteration.
grouped_result_df = pd.read_csv("grouped_result.csv")  # Ground truth file

if grouped_result_df.empty:
    # Every average below divides by the number of ground-truth rows.
    raise ValueError(
        "grouped_result.csv contains no ground-truth rows to evaluate against"
    )

# Label collections are stored as text in the CSV (presumably list literals --
# confirm against the file). ast.literal_eval accepts only Python literals, so
# unlike eval it cannot execute code embedded in a cell.
grouped_result_df["Ausschreibungskriterium"] = (
    grouped_result_df["Ausschreibungskriterium"].apply(ast.literal_eval)
)

for file, pipeline_name in zip(pipeline_files, pipeline_names):
    # Load the pipeline results
    pipeline_df = pd.read_csv(file)

    # Ensure proper formatting (same literal-only parsing as the ground truth)
    pipeline_df["predicted_guidelines"] = (
        pipeline_df["predicted_guidelines"].apply(ast.literal_eval)
    )

    # Index predictions by sentence once, so each ground-truth row is a dict
    # lookup rather than a scan of the whole frame. setdefault keeps the first
    # row for a repeated sentence; missing sentences are skipped so they never
    # match anything.
    predictions_by_sentence = {}
    for known_sentence, guidelines in zip(
        pipeline_df["sentence"], pipeline_df["predicted_guidelines"]
    ):
        if pd.notna(known_sentence):
            predictions_by_sentence.setdefault(known_sentence, guidelines)

    # Initialize metrics
    precisions, recalls, f1_scores, accuracies = [], [], [], []

    # Calculate metrics for each sentence
    for sentence, ground_truth in zip(
        grouped_result_df["sentence"], grouped_result_df["Ausschreibungskriterium"]
    ):
        # A sentence the pipeline produced no row for counts as "nothing predicted".
        predicted = predictions_by_sentence.get(sentence, [])

        precision = precision_at_sentence_level(predicted, ground_truth)
        recall = recall_at_sentence_level(predicted, ground_truth)
        f1 = f1_score(precision, recall)
        # Exact-match accuracy: the predicted label set must equal the truth.
        accuracy = 1 if set(predicted) == set(ground_truth) else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        accuracies.append(accuracy)

    # Aggregate metrics
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    # NOTE(review): the reported F1 is derived from the averaged precision and
    # recall; the per-sentence values in f1_scores are collected but not used
    # here. Confirm which definition is intended.
    avg_f1 = f1_score(avg_precision, avg_recall)
    avg_accuracy = sum(accuracies) / len(accuracies)

    # Store pipeline metrics
    pipeline_metrics.append({
        "Pipeline": pipeline_name,
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F1-Score": avg_f1,
        "Accuracy": avg_accuracy,
    })

# Convert metrics to a DataFrame (one row per pipeline, one column per metric)
metrics_df = pd.DataFrame(pipeline_metrics)

# Visualization
# Grouped overview: reshape to one row per (pipeline, metric) pair so that the
# metric goes on the x axis and the pipelines become the coloured bars.
metrics_long = metrics_df.melt(id_vars="Pipeline", var_name="Metric", value_name="Score")
plt.figure(figsize=(12, 6))
sns.barplot(data=metrics_long, x="Metric", y="Score", hue="Pipeline")
plt.title("Comparison of Metrics Across Pipelines")
plt.ylim(0, 1)
plt.ylabel("Score")
plt.legend(title="Pipeline")
plt.show()

# One separate figure per metric.
metrics = ["Precision", "Recall", "F1-Score", "Accuracy"]
for metric in metrics:
    plt.figure(figsize=(8, 5))
    # Passing palette without hue is deprecated in seaborn 0.13, so the x
    # variable is also assigned to hue. dodge=False keeps each bar centred on
    # its tick instead of being offset per hue level.
    ax = sns.barplot(
        data=metrics_df,
        x="Pipeline",
        y=metric,
        hue="Pipeline",
        palette="viridis",
        dodge=False,
    )
    # The hue legend only repeats the x-axis labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title(f"{metric} Comparison Across Pipelines")
    plt.ylim(0, 1)
    plt.ylabel(metric)
    plt.xlabel("Pipeline")
    plt.show()