# Goal of the Task
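You are measuring how well your predicted .ann file (from Part 2) matches the ground-truth .ann file in the original/ sub-directory. Specifically, you are evaluating the model's span-level entity labeling: a predicted span counts as correct only when its start offset, end offset, and label all exactly match a ground-truth span.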

# Why This Evaluation Metric?
The code uses Precision, Recall, and F1 Score, which are standard and interpretable metrics for evaluating sequence labeling tasks (like NER/BIO tagging). Here's why they are a good choice:

**Precision:** Of all predicted spans, what fraction are correct? (TP / (TP + FP))

**Recall:** Of all ground-truth spans, what fraction were correctly found? (TP / (TP + FN))

**F1 Score:** The harmonic mean of precision and recall, 2 · P · R / (P + R) — it balances both.

In [11]:
import os
import re
from collections import Counter

# ------------------------
# ⚙️ CONFIGURATION
# ------------------------
# Ground-truth annotation file for the single document being evaluated.
ground_truth_path = "./cadec/original/LIPITOR.331.ann"
# Predicted annotation file for the same document; it is compared against
# the ground-truth file above.
predicted_path = "./predicted/LIPITOR.331.ann" 

# ------------------------
# 🔍 LOAD ANN FILES
# ------------------------
def load_ann_file(filepath):
    """Load labeled spans from a tab-separated ``.ann`` annotation file.

    A usable line has at least three tab-separated fields: an identifier,
    a ``"<label> <start> ... <end>"`` field, and the annotated text. Only
    the second field is interpreted. Blank lines, lines starting with
    ``#``, and lines with fewer than three fields are skipped.

    Args:
        filepath: Path of the ``.ann`` file; it is read as UTF-8.

    Returns:
        A set of ``(start, end, label)`` tuples. Because a set is used,
        duplicate spans collapse into one entry. For a span field with
        several offset fragments (e.g. ``"ADR 40 45;50 60"``), ``start`` is
        the first offset and ``end`` is the last one, so the fragments are
        treated as a single enclosing span.

    Raises:
        ValueError: If the first or last offset token is not an integer.
        IndexError: If the span field contains only a label and no offsets.
    """
    entries = set()
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            # Index the span field directly instead of unpacking exactly
            # three values: a line with extra tab-separated fields (for
            # example, annotated text that itself contains a tab) would
            # otherwise raise ValueError even though it passed the
            # "at least three fields" check above.
            tag_parts = parts[1].split()
            label = tag_parts[0]
            start = int(tag_parts[1])
            # Last token, not tag_parts[2]: multi-fragment spans put the
            # final end offset at the end of the field.
            end = int(tag_parts[-1])
            entries.add((start, end, label))
    return entries

# ------------------------
# 🧮 COMPUTE METRICS
# ------------------------
def evaluate_ann(predicted, ground_truth):
    """Score predicted spans against ground-truth spans by exact match.

    Args:
        predicted: Set of predicted span items.
        ground_truth: Set of reference span items.

    Returns:
        A dict with the raw counts ("True Positives", "False Positives",
        "False Negatives") and "Precision", "Recall", "F1 Score", each
        rounded to four decimal places. Any ratio whose denominator is
        zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Guard against division by zero when a side has no spans.
        if denominator > 0:
            return numerator / denominator
        return 0.0

    matched = predicted & ground_truth
    spurious = predicted - ground_truth
    missed = ground_truth - predicted

    tp, fp, fn = len(matched), len(spurious), len(missed)

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * precision * recall, precision + recall)

    report = {"True Positives": tp, "False Positives": fp, "False Negatives": fn}
    report["Precision"] = round(precision, 4)
    report["Recall"] = round(recall, 4)
    report["F1 Score"] = round(f1, 4)
    return report

# ------------------------
# ✅ RUN TASK 3 EVALUATION
# ------------------------
# Fail fast before parsing, and name the path(s) that are missing so the
# user does not have to work out which of the two files is the problem.
missing_paths = [
    path for path in (ground_truth_path, predicted_path)
    if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError(
        "Check that both ground truth and prediction files exist. "
        f"Missing: {', '.join(missing_paths)}"
    )

# Parse both files into sets of (start, end, label) spans.
gt_spans = load_ann_file(ground_truth_path)
pred_spans = load_ann_file(predicted_path)

# Predicted spans go first, ground truth second, matching evaluate_ann's
# parameter order.
metrics = evaluate_ann(pred_spans, gt_spans)

print("\n📊 Task 3 Evaluation Report (Span-Level):")
for k, v in metrics.items():
    print(f"{k}: {v}")


print("\n✅ Task 3 complete.")



📊 Task 3 Evaluation Report (Span-Level):
True Positives: 11
False Positives: 12
False Negatives: 8
Precision: 0.4783
Recall: 0.5789
F1 Score: 0.5238

✅ Task 3 complete.
