**Task5**

In [36]:
import os
import random
from collections import Counter

# ------------------------
# ⚙️ CONFIGURATION
# ------------------------
# Directory holding the reference .ann files; each predicted file is compared
# against the file of the same name in this directory.
ground_truth_dir = "./cadec/original"
# Directory that is scanned for the predicted .ann files to evaluate.
predicted_dir = "./predicted"
# Upper bound on how many predicted files are randomly sampled for evaluation.
num_files_to_evaluate = 5

# ------------------------
# 🔍 LOAD ANN FILES
# ------------------------
def load_ann_file(filepath):
    """Parse an annotation file into a set of ``(start, end, label)`` tuples.

    Each useful line is expected to hold at least three tab-separated
    columns: an identifier, a tag-info column of the form
    ``"<label> <start> ... <end>"`` and the annotated text. Only the tag-info
    column is used.

    Lines are skipped (never raise) when they:
      * start with ``#`` or are blank,
      * have fewer than three tab-separated columns,
      * have a tag-info column without a label plus at least one offset,
      * carry offsets that are not integers.

    Args:
        filepath: Path of the annotation file, read as UTF-8.

    Returns:
        A set of ``(start, end, label)`` tuples. Being a set, duplicate
        annotations collapse into a single entry.
    """
    entries = set()
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            # Index the tag-info column instead of unpacking exactly three
            # values, so a stray extra tab-separated column cannot raise.
            tag_parts = parts[1].split()
            if len(tag_parts) < 2:
                # No offset available after the label: nothing to record.
                continue
            try:
                # The first offset token is the start and the LAST token is
                # the end, so tag info with intermediate tokens (for example
                # "ADR 9 19;20 30") collapses to one span from 9 to 30.
                start = int(tag_parts[1])
                end = int(tag_parts[-1])
            except ValueError:
                # Non-numeric offsets: treat the line as malformed and skip.
                continue
            entries.add((start, end, tag_parts[0]))
    return entries

# ------------------------
# 🧮 COMPUTE METRICS
# ------------------------
def evaluate_ann(predicted, ground_truth):
    """Score a set of predicted annotations against the reference set.

    Both arguments are sets; an element counts as correct only when it is
    present in both sets (exact element equality).

    Args:
        predicted: Set of predicted annotations.
        ground_truth: Set of reference annotations.

    Returns:
        A dict with the raw counts ("True Positives", "False Positives",
        "False Negatives") and "Precision", "Recall" and "F1 Score", each
        rounded to four decimals. A ratio whose denominator is zero is
        reported as 0.0.
    """
    hits = len(predicted & ground_truth)
    spurious = len(predicted - ground_truth)
    missed = len(ground_truth - predicted)

    n_predicted = hits + spurious
    n_reference = hits + missed

    # Each ratio defaults to 0.0 and is only computed when its denominator
    # is positive, which avoids division by zero on empty inputs.
    precision = 0.0
    if n_predicted > 0:
        precision = hits / n_predicted

    recall = 0.0
    if n_reference > 0:
        recall = hits / n_reference

    f1 = 0.0
    pr_total = precision + recall
    if pr_total > 0:
        f1 = 2 * precision * recall / pr_total

    names = (
        "True Positives",
        "False Positives",
        "False Negatives",
        "Precision",
        "Recall",
        "F1 Score",
    )
    values = (
        hits,
        spurious,
        missed,
        round(precision, 4),
        round(recall, 4),
        round(f1, 4),
    )
    return dict(zip(names, values))

# ------------------------
# ✅ RUN EVALUATION ON RANDOM FILES
# ------------------------
def _average_metrics(all_metrics):
    """Return the per-file mean of every metric in *all_metrics* (non-empty)."""
    count = len(all_metrics)
    averaged = {}
    # Raw counts are averaged as-is; the ratio metrics are rounded to four
    # decimals. Note the inputs are the already-rounded per-file values.
    for key in ("True Positives", "False Positives", "False Negatives"):
        averaged[key] = sum(m[key] for m in all_metrics) / count
    for key in ("Precision", "Recall", "F1 Score"):
        averaged[key] = round(sum(m[key] for m in all_metrics) / count, 4)
    return averaged


def evaluate_directory(ground_truth_dir, predicted_dir, num_files):
    """Evaluate a random sample of predicted ``.ann`` files and print the results.

    Every ``.ann`` file name found in *predicted_dir* is a candidate. When
    there are more candidates than *num_files*, a random subset of that size
    is drawn; otherwise all candidates are used. Each selected file is
    compared with the file of the same name in *ground_truth_dir*; files
    missing on either side are reported and skipped. Per-file metrics are
    printed, followed by their average over the evaluated files (if any).

    Args:
        ground_truth_dir: Directory containing the reference ``.ann`` files.
        predicted_dir: Directory containing the predicted ``.ann`` files.
        num_files: Maximum number of files to evaluate.

    Raises:
        ValueError: Raised by ``random.sample`` when *num_files* is negative
            and at least one candidate exists.
        OSError: If *predicted_dir* cannot be listed.
    """
    # Get all .ann files in the predicted directory
    ann_files = [f for f in os.listdir(predicted_dir) if f.endswith(".ann")]

    # Select random subset of files
    if len(ann_files) > num_files:
        ann_files = random.sample(ann_files, num_files)

    all_metrics = []

    # Report how many files were actually selected: the directory may hold
    # fewer .ann files than the requested sample size.
    print(f"\n📊 Evaluating {len(ann_files)} random files from {predicted_dir}")

    for file_name in ann_files:
        gt_path = os.path.join(ground_truth_dir, file_name)
        pred_path = os.path.join(predicted_dir, file_name)

        if not os.path.exists(gt_path):
            print(f"❌ Ground truth file not found for {file_name}, skipping...")
            continue

        if not os.path.exists(pred_path):
            print(f"❌ Predicted file not found for {file_name}, skipping...")
            continue

        print(f"\n📄 Evaluating: {file_name}")
        gt_spans = load_ann_file(gt_path)
        pred_spans = load_ann_file(pred_path)

        metrics = evaluate_ann(pred_spans, gt_spans)
        all_metrics.append(metrics)

        print(f"Metrics for {file_name}:")
        for k, v in metrics.items():
            print(f"  {k}: {v}")

    # Compute average metrics (skipped when no file could be evaluated)
    if all_metrics:
        avg_metrics = _average_metrics(all_metrics)

        print("\n📈 Average Metrics Across Files:")
        for k, v in avg_metrics.items():
            print(f"  {k}: {v}")

    print("\n✅ Evaluation complete.")

if __name__ == "__main__":
    # Fail fast, before any evaluation output, when a configured directory
    # path does not exist.
    both_present = os.path.exists(ground_truth_dir) and os.path.exists(predicted_dir)
    if not both_present:
        raise FileNotFoundError("Check that both ground truth and predicted directories exist.")

    evaluate_directory(
        ground_truth_dir,
        predicted_dir,
        num_files_to_evaluate,
    )


📊 Evaluating 5 random files from ./predicted

📄 Evaluating: ARTHROTEC.105.ann
Metrics for ARTHROTEC.105.ann:
  True Positives: 2
  False Positives: 4
  False Negatives: 9
  Precision: 0.3333
  Recall: 0.1818
  F1 Score: 0.2353

📄 Evaluating: ARTHROTEC.64.ann
Metrics for ARTHROTEC.64.ann:
  True Positives: 2
  False Positives: 6
  False Negatives: 7
  Precision: 0.25
  Recall: 0.2222
  F1 Score: 0.2353

📄 Evaluating: LIPITOR.844.ann
Metrics for LIPITOR.844.ann:
  True Positives: 1
  False Positives: 7
  False Negatives: 5
  Precision: 0.125
  Recall: 0.1667
  F1 Score: 0.1429

📄 Evaluating: LIPITOR.846.ann
Metrics for LIPITOR.846.ann:
  True Positives: 0
  False Positives: 4
  False Negatives: 2
  Precision: 0.0
  Recall: 0.0
  F1 Score: 0.0

📄 Evaluating: ARTHROTEC.100.ann
Metrics for ARTHROTEC.100.ann:
  True Positives: 1
  False Positives: 7
  False Negatives: 4
  Precision: 0.125
  Recall: 0.2
  F1 Score: 0.1538

📈 Average Metrics Across Files:
  True Positives: 1.2
  False Positiv