# Task 4: ADR-Only Evaluation (Text-Based) using MedDRA Ground Truth

This script evaluates predicted ADR (Adverse Drug Reaction) mentions against the MedDRA-standardized ground truth. Mention texts are normalized (lowercased, punctuation removed) and compared by exact string match over the sets of unique mentions; the results are reported as precision, recall, and F1-score.



In [1]:
import os
import re
from sklearn.metrics import precision_recall_fscore_support

def normalize_text(text):
    """Return *text* in a canonical form for exact-match comparison.

    The text is lowercased, every character that is neither a word
    character (letter, digit, underscore) nor whitespace is removed, and
    whitespace is then normalized: leading/trailing whitespace is dropped
    and each internal run collapses to a single space.
    """
    without_punct = re.sub(r'[^\w\s]', '', text.lower())
    # Deleting punctuation that sat between words ("joint - pain") leaves a
    # double space behind; split/join collapses those runs so the result
    # still matches "joint pain".
    return ' '.join(without_punct.split())

def load_adr_texts(file_path, is_meddra=False):
    """Collect the unique, normalized mention texts from an annotation file.

    The file is read as UTF-8, one annotation per line. Blank lines and
    lines starting with '#' are ignored. Each remaining line is split on
    tabs; lines with fewer than two fields are skipped, and the last field
    is taken as the mention text.

    Args:
        file_path: Path of the annotation file to read.
        is_meddra: When true, every parsed line contributes its text.
            When false, only lines whose second field starts with the
            token 'ADR' contribute.

    Returns:
        A set of mention texts after ``normalize_text``; texts that
        normalize to the empty string are dropped.
    """
    adr_texts = set()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('#') or not line.strip():
                continue
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue

            if not is_meddra:
                label_info = parts[1].split()
                # An empty or whitespace-only label field yields no tokens;
                # treat the line as non-ADR instead of raising IndexError.
                if not label_info or label_info[0] != 'ADR':
                    continue

            text = normalize_text(parts[-1])
            if text:
                adr_texts.add(text)
    return adr_texts

def evaluate_adr_text_only(pred_file, meddra_file):
    """Print a text-based evaluation of predicted ADR mentions.

    Both files are loaded with ``load_adr_texts`` into sets of normalized
    mention strings. A predicted string counts as a true positive only if
    the identical string is in the ground-truth set. The two sets and the
    resulting precision, recall and F1-score are printed.

    Args:
        pred_file: Path of the predicted annotation file (ADR lines only
            are used).
        meddra_file: Path of the MedDRA ground-truth annotation file.

    Returns:
        None. If the ground truth contains no mentions a warning is
        printed and no metrics are computed.
    """
    print(f"\n📄 Predicted File: {os.path.basename(pred_file)}")
    print(f"📄 MedDRA File:    {os.path.basename(meddra_file)}")

    pred_texts = load_adr_texts(pred_file, is_meddra=False)
    true_texts = load_adr_texts(meddra_file, is_meddra=True)

    print("\n✅ Ground Truth ADR Texts:")
    for t in sorted(true_texts):
        print("  •", t)

    print("\n🔮 Predicted ADR Texts:")
    for t in sorted(pred_texts):
        print("  •", t)

    # Guard on the ground truth itself, as the message states: with no
    # reference mentions recall is undefined, whether or not anything was
    # predicted.
    if not true_texts:
        print("\n⚠️  No ADR labels found in ground truth. Cannot evaluate.")
        return

    true_positives = len(pred_texts & true_texts)
    false_positives = len(pred_texts - true_texts)
    false_negatives = len(true_texts - pred_texts)

    # One binary label per distinct mention: matched predictions, then
    # spurious predictions, then missed ground-truth mentions.
    y_true = [1] * true_positives + [0] * false_positives + [1] * false_negatives
    y_pred = [1] * (true_positives + false_positives) + [0] * false_negatives

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    print("\n📊 Text-Based Evaluation Metrics:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")

# Demo run: score one predicted annotation file against the MedDRA
# annotation file of the same name.
predicted_path = "./predicted/ARTHROTEC.104.ann"
meddra_path = r"./cadec/meddra/ARTHROTEC.104.ann"

evaluate_adr_text_only(pred_file=predicted_path, meddra_file=meddra_path)



📄 Predicted File: ARTHROTEC.104.ann
📄 MedDRA File:    ARTHROTEC.104.ann

✅ Ground Truth ADR Texts:
  • constipation
  • diarrhea
  • fatigue

🔮 Predicted ADR Texts:
  • constipation
  • severe osteoarthritis
  • some diarrhea

📊 Text-Based Evaluation Metrics:
Precision: 0.3333
Recall:    0.3333
F1-score:  0.3333
