In [1]:
pip install transformers datasets torch sentence-transformers scikit-learn seqeval pandas numpy nltk

Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [2]:
# -------------------------------
# 📌 Task5_Random50.ipynb
# -------------------------------

import os
import random
import json
import pandas as pd

# -------------------------------
# Paths
# -------------------------------
DATA_DIR = r"../data/CADEC.v2"   # ground truth
PRED_DIR = r"../outputs/task2"   # predictions from Task 2
OUTPUT_DIR = r"../outputs/task5"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------------------
# Helper functions
# -------------------------------
def read_ground_truth_spans_with_offsets(ann_file):
    """Read spans from ground truth .ann file in original/"""
    spans = []
    with open(ann_file, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            label_info = parts[1].split()
            label = label_info[0]
            try:
                start, end = int(label_info[1]), int(label_info[2])
            except:
                continue
            text = parts[2].strip()
            spans.append((label, start, end, text))
    return spans

def load_predicted_spans(json_file):
    """Load spans from Task 2 JSON predictions"""
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    spans = []
    for item in data:
        try:
            start, end = map(int, item["span"].split())
            spans.append((item["label"], start, end, item["text"]))
        except:
            continue
    return spans

def overlap(a_start, a_end, b_start, b_end):
    """Check if two spans overlap"""
    return max(a_start, b_start) < min(a_end, b_end)

# -------------------------------
# Step 1: Pick Random 50 Files
# -------------------------------
all_pred_json = [f for f in os.listdir(PRED_DIR) if f.endswith("_predictions.json")]
random.shuffle(all_pred_json)
sampled_files = all_pred_json[:50]

# Save sampled list
sampled_path = os.path.join(OUTPUT_DIR, "step5_sampled_files.txt")
with open(sampled_path, "w") as f:
    for file in sampled_files:
        f.write(file.replace("_predictions.json", ".txt") + "\n")

print(f"📄 50 random files selected and saved: {sampled_path}")

# -------------------------------
# Step 2: Relaxed Evaluation
# -------------------------------
results = []
skipped = 0

for pred_file in sampled_files:
    base = pred_file.replace("_predictions.json", "")
    gt_ann = os.path.join(DATA_DIR, "original", f"{base}.ann")
    pred_path = os.path.join(PRED_DIR, pred_file)

    if not os.path.exists(gt_ann) or not os.path.exists(pred_path):
        skipped += 1
        continue

    gt_spans = read_ground_truth_spans_with_offsets(gt_ann)
    pred_spans = load_predicted_spans(pred_path)

    gt_matched, pred_matched = set(), set()

    for pi, (plabel, pstart, pend, ptext) in enumerate(pred_spans):
        for gi, (glabel, gstart, gend, gtext) in enumerate(gt_spans):
            if plabel.lower() == glabel.lower() and overlap(pstart, pend, gstart, gend):
                gt_matched.add(gi)
                pred_matched.add(pi)

    tp = len(pred_matched)
    fp = len(pred_spans) - len(pred_matched)
    fn = len(gt_spans) - len(gt_matched)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    results.append({"file": base, "precision": precision, "recall": recall, "f1": f1})

# -------------------------------
# Step 3: Macro Scores
# -------------------------------
avg_precision = sum(r["precision"] for r in results) / len(results) if results else 0.0
avg_recall = sum(r["recall"] for r in results) / len(results) if results else 0.0
avg_f1 = sum(r["f1"] for r in results) / len(results) if results else 0.0

print(f"[RELAXED] Evaluated {len(results)} posts. Skipped {skipped} due to missing files.")
print(f"[RELAXED] Macro Precision: {avg_precision:.3f}")
print(f"[RELAXED] Macro Recall:    {avg_recall:.3f}")
print(f"[RELAXED] Macro F1-score:  {avg_f1:.3f}")

for r in results[:20]:  # show first 20 files
    print(f"{r['file']}: Precision={r['precision']:.3f}, Recall={r['recall']:.3f}, F1={r['f1']:.3f}")

# -------------------------------
# Step 4: Save Results
# -------------------------------
df = pd.DataFrame(results)
csv_path = os.path.join(OUTPUT_DIR, "task5_metrics.csv")
json_path = os.path.join(OUTPUT_DIR, "task5_metrics.json")

df.to_csv(csv_path, index=False)
with open(json_path, "w", encoding="utf-8") as f:
    json.dump({
        "Macro Precision": avg_precision,
        "Macro Recall": avg_recall,
        "Macro F1": avg_f1,
        "Per-file Results": results
    }, f, indent=2)

print(f"\n✅ Task 5 Evaluation complete!")
print(f"- Results saved: {csv_path}")
print(f"- Results saved: {json_path}")


📄 50 random files selected and saved: ../outputs/task5\step5_sampled_files.txt
[RELAXED] Evaluated 50 posts. Skipped 0 due to missing files.
[RELAXED] Macro Precision: 0.483
[RELAXED] Macro Recall:    0.435
[RELAXED] Macro F1-score:  0.426
LIPITOR.516: Precision=0.000, Recall=0.000, F1=0.000
LIPITOR.667: Precision=1.000, Recall=1.000, F1=1.000
ARTHROTEC.57: Precision=0.000, Recall=0.000, F1=0.000
LIPITOR.512: Precision=1.000, Recall=0.667, F1=0.800
LIPITOR.9: Precision=0.095, Recall=0.222, F1=0.133
LIPITOR.562: Precision=0.250, Recall=1.000, F1=0.400
LIPITOR.311: Precision=0.250, Recall=0.250, F1=0.250
LIPITOR.555: Precision=1.000, Recall=0.500, F1=0.667
LIPITOR.658: Precision=0.385, Recall=0.455, F1=0.417
ARTHROTEC.70: Precision=0.250, Recall=0.500, F1=0.333
LIPITOR.364: Precision=1.000, Recall=0.500, F1=0.667
LIPITOR.536: Precision=0.600, Recall=0.600, F1=0.600
LIPITOR.699: Precision=0.500, Recall=0.625, F1=0.556
LIPITOR.792: Precision=0.750, Recall=0.429, F1=0.545
LIPITOR.144: Preci