In [1]:
pip install transformers datasets torch sentence-transformers scikit-learn seqeval pandas


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


# Task-4: ADR-only Performance Evaluation

In this task, we evaluate **only the ADR (Adverse Drug Reaction)** predictions.  
Unlike Task-3, where all labels (Drug, Disease, Symptom, ADR) were evaluated,  
here the **ground truth is taken from the `meddra/` folder**, which contains  
only ADR annotations.

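We compare the model's predicted ADRs (the `.pred.ann` files) against the ground-truth ADRs from `meddra/`  
and compute **Precision, Recall, and F1-score**. In the example run below, predictions that only partially overlap a ground-truth span (e.g. `drowsy` vs. `bit drowsy`) are not counted as true positives.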


In [4]:
import os, sys
sys.path.append("../utils")

from evaluation_adr import (
    read_ann_file_all_as_adr,
    read_ann_file_adr_only,
    compare_entities,
    compute_metrics
)

# Root folders for the CADEC corpus and for the generated model outputs.
DATA_DIR = "../data/CADEC.v2"
OUTPUT_DIR = "../outputs"

# Single-document example: ARTHROTEC.1
filename = "ARTHROTEC.1"
gt_ann_file = os.path.join(DATA_DIR, "meddra", filename + ".ann")
pred_ann_file = os.path.join(OUTPUT_DIR, "task2", filename + ".pred.ann")

# Ground truth: entities read from the meddra annotation file and listed one per line.
print("--- Ground Truth ADR Entities (meddra, all treated as ADR) ---")
gt_entities = read_ann_file_all_as_adr(gt_ann_file)
for ent in gt_entities:
    print(ent)

# Predictions: entities read from the Task 2 prediction file, when one exists.
print("\n--- Predicted ADR Entities (only ADR label) ---")
if not os.path.exists(pred_ann_file):
    # No prediction file: score against an empty prediction list.
    pred_entities = []
    print("No prediction file found. Please generate predictions in Task 2.")
else:
    pred_entities = read_ann_file_adr_only(pred_ann_file)
    for ent in pred_entities:
        print(ent)

# Match predictions against the ground truth, then derive the summary metrics.
tp, fp, fn = compare_entities(pred_entities, gt_entities)
metrics = compute_metrics(tp, fp, fn)

# Report the scores to two decimals, followed by the raw match counts.
print("\n--- Evaluation Metrics (ADR only, using meddra ground truth) ---")
print("Precision: {:.2f}".format(metrics["precision"]))
print("Recall:    {:.2f}".format(metrics["recall"]))
print("F1-score:  {:.2f}".format(metrics["f1"]))
print("True Positives: {}".format(len(tp)))
print("False Positives: {}".format(len(fp)))
print("False Negatives: {}".format(len(fn)))


--- Ground Truth ADR Entities (meddra, all treated as ADR) ---
('ADR', 9, 19, 'bit drowsy')
('ADR', 29, 50, 'little blurred vision')
('ADR', 62, 78, 'gastric problems')
('ADR', 437, 453, 'feel a bit weird')

--- Predicted ADR Entities (only ADR label) ---
('ADR', 13, 19, 'drowsy')
('ADR', 36, 43, 'blurred')
('ADR', 93, 96, 'art')
('ADR', 412, 417, 'pains')

--- Evaluation Metrics (ADR only, using meddra ground truth) ---
Precision: 0.00
Recall:    0.00
F1-score:  0.00
True Positives: 0
False Positives: 4
False Negatives: 4
