# 05 - Evaluation and Error Analysis

## Goal

Evaluate on held-out test set and do quick error analysis to learn where the model fails.


In [14]:
# === TODO (you code this) ===
# Goal: Import libraries for model evaluation.
# Hints:
# 1) pandas, numpy, transformers (AutoTokenizer, AutoModelForSequenceClassification)
# 2) torch, sklearn.metrics
# Acceptance:
# - All imports successful
import pandas as pd 
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report, roc_auc_score, average_precision_score





In [15]:
# === TODO (you code this) ===
# Goal: Define LABELS (MUST match training order exactly!).
# Acceptance:
# - LABELS list with all 10 categories in correct order

# TODO: define LABELS constant
LABELS = [
    'SystematicReview',  # 1. Systematic reviews
    'MetaAnalysis',      # 2. Meta-analyses (quantitative synthesis)
    'RCT',               # 3. Randomized Controlled Trials
    'ClinicalTrial',     # 4. Non-randomized clinical trials
    'Cohort',            # 5. Cohort studies (prospective/retrospective)
    'CaseControl',       # 6. Case-control studies
    'CaseReport',        # 7. Case reports / case series
    'InVitro',           # 8. In vitro or ex vivo laboratory studies
    'Animal',            # 9. Animal studies
    'Human'              # 10. Human subjects (not mutually exclusive)
]



## Load Model & Tokenizer


In [17]:
# === TODO (you code this) ===
# Goal: Load trained model and tokenizer.
# Hints:
# 1) Load from ../artifacts/model/best
# 2) Set model to eval mode
# Acceptance:
# - tokenizer and model loaded
# - model.eval() called

# TODO: load model and tokenizer
model_path = '../artifacts/model/best'

# Load tokenizer (uses tokenizer_config.json, tokenizer.json, vocab.txt, special_tokens_map.json)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model (uses config.json and model.safetensors)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set to eval mode
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Load Test Set


In [18]:
# === TODO (you code this) ===
# Goal: Load test set and create text column.
# Hints:
# 1) Load test.parquet from ../data/processed
# 2) Concatenate title + abstract (truncate to 2000 chars)
# Acceptance:
# - test_df loaded with 'text' column

# TODO: load test set
test_df = pd.read_parquet('../data/processed/test.parquet')
test_df['text'] = test_df['title'] + ' ' + test_df['abstract']
test_df['text'] = test_df['text'].str[:2000]

# Check acceptance
assert 'text' in test_df.columns
assert len(test_df) > 0


## Predict Probabilities


In [20]:
print(test_df.dtypes)

title           object
abstract        object
journal         object
year           float64
pub_types       object
mesh_terms      object
pmid            object
labels          object
label_count      int64
split           object
text            object
dtype: object


In [21]:
# === TODO (you code this) ===
# Goal: Generate probability predictions for test set.
# Hints:
# 1) Batch texts, tokenize, run through model
# 2) Apply sigmoid to logits
# 3) Return (n_samples, n_labels) array
# Acceptance:
# - Function predict_probs(texts, batch_size) -> np.array
# - test_probs shape is (len(test_df), 10)

def predict_probs(texts, batch_size=16):
    """Generate probability predictions for texts."""
    # TODO
    if hasattr(texts, "tolist"):
        texts = texts.tolist()

    preds = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()
            preds.append(probs)
    return np.concatenate(preds)

test_probs = predict_probs(test_df['text'])
assert test_probs.shape == (len(test_df), 10)

# Check acceptance
assert test_probs.shape == (len(test_df), 10)   



## Threshold Selection

Default: 0.5 global threshold. For better performance, tune per-label on validation set.


In [22]:
# === TODO (you code this) ===
# Goal: Convert probabilities to binary predictions.
# Hints:
# 1) Use threshold (e.g., 0.5)
# 2) Cast to int
# Acceptance:
# - test_preds is binary matrix (0 or 1)

# TODO: apply threshold to create test_preds
test_preds = (test_probs >= 0.5).astype(int)

# Check acceptance
assert test_preds.shape == (len(test_df), 10)

In [25]:
# === TODO (you code this) ===
# Goal: Build ground truth binary matrix.
# Hints:
# 1) Convert test_df['labels'] lists to binary vectors
# 2) Same binarize logic as training
# Acceptance:
# - test_true shape matches test_preds

# TODO: create test_true matrix
# You need to convert each label list to a binary vector
# Example: ['Human', 'RCT'] → [0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
#          (RCT is at index 2, Human is at index 9)

def binarize_labels(labels_list):
    """Convert list of labels to multi-hot vector."""
    zero_vector = [0.0] * 10  # 10 labels
    for label in labels_list:
        if label in LABELS:
            zero_vector[LABELS.index(label)] = 1.0
    return zero_vector

# Apply to all rows
test_true = np.array([binarize_labels(labels) for labels in test_df['labels']])

# Acceptance test
assert test_true.shape == test_preds.shape

## Aggregate Metrics


In [30]:
# === TODO (you code this) ===
# Goal: Compute and display aggregate metrics.
# Hints:
# 1) Use sklearn precision_recall_fscore_support
# 2) Compute both micro and macro averages
# 3) Print formatted results
# Acceptance:
# - Shows 6 metrics: precision/recall/f1 for micro and macro

# TODO: compute and print metrics
target_names = LABELS

score_results = precision_recall_fscore_support(test_true, test_preds, average='micro')

print(f"Micro-average precision: {score_results[0]:.4f}")
print(f"Micro-average recall: {score_results[1]:.4f}")
print(f"Micro-average F1 score: {score_results[2]:.4f}")

print()

macro_results = precision_recall_fscore_support(test_true, test_preds, average='macro')

print(f"Macro-average precision: {macro_results[0]:.4f}")
print(f"Macro-average recall: {macro_results[1]:.4f}")
print(f"Macro-average F1 score: {macro_results[2]:.4f}")





Micro-average precision: 0.8966
Micro-average recall: 0.8868
Micro-average F1 score: 0.8917

Macro-average precision: 0.8201
Macro-average recall: 0.7596
Macro-average F1 score: 0.7397


## Per-label Performance


In [31]:
# === TODO (you code this) ===
# Goal: Print per-label performance report.
# Hints:
# 1) Use sklearn classification_report
# 2) Pass target_names=LABELS for readable output
# Acceptance:
# - Shows precision/recall/f1/support for each label

# TODO: print classification_report
print(classification_report(test_true, test_preds, target_names=target_names))



                  precision    recall  f1-score   support

SystematicReview       0.81      0.93      0.87      1326
    MetaAnalysis       0.77      0.97      0.86       601
             RCT       0.70      0.92      0.80      1046
   ClinicalTrial       0.64      0.28      0.39       103
          Cohort       0.69      0.89      0.78      1768
     CaseControl       0.89      0.04      0.08      1513
      CaseReport       0.95      0.89      0.92      1409
         InVitro       0.93      0.93      0.93      2183
          Animal       0.86      0.79      0.82      1651
           Human       0.95      0.96      0.96     16489

       micro avg       0.90      0.89      0.89     28089
       macro avg       0.82      0.76      0.74     28089
    weighted avg       0.90      0.89      0.87     28089
     samples avg       0.92      0.93      0.90     28089



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Error Analysis Samples

Inspect false positives and false negatives.


In [33]:
# === TODO (you code this) ===
# Goal: Show false positives/negatives for selected labels.
# Hints:
# 1) Choose 1-2 interesting labels (e.g., RCT, SystematicReview)
# 2) Find where pred != true using boolean masks
# 3) Print pmid, title snippet, true labels, predicted prob
# Acceptance:
# - Shows 3-5 error examples with context

# TODO: display error samples
choosen_labels = ["CaseControl", "ClinicalTrial"]

for label in choosen_labels:
    print(f"Label: {label}")
    print("-" * len(label))
    
    # Get indices where pred != true
    error_indices = np.where(test_true[:, LABELS.index(label)] != test_preds[:, LABELS.index(label)])[0]
    
    for idx in error_indices:
        print(f"PMID: {test_df.iloc[idx]['pmid']}")
        print(f"Title: {test_df.iloc[idx]['title']}")
        print(f"Abstract: {test_df.iloc[idx]['abstract']}")
        print(f"True labels: {test_df.iloc[idx]['labels']}")
        print(f"Predicted labels: {test_preds[idx]}")
        print()
        print("-" * 50)
        print()

print("=" * 100)
print("Some samples for each label to check the error analysis")
print("=" * 100)

# A 10 samples for each label to check the error analysis
for label in choosen_labels:
    print(f"Label: {label}")
    print("-" * len(label))
    error_indices = np.where(test_true[:, LABELS.index(label)] != test_preds[:, LABELS.index(label)])[0]
    for idx in error_indices[:10]:
        print(f"PMID: {test_df.iloc[idx]['pmid']}")
        print(f"Title: {test_df.iloc[idx]['title']}")   
        print(f"Abstract: {test_df.iloc[idx]['abstract']}")
        print(f"True labels: {test_df.iloc[idx]['labels']}")
        print(f"Predicted labels: {test_preds[idx]}")
        print()
        print("-" * 50)
        print()

    




Label: CaseControl
-----------
PMID: 39218838
Title: Evaluating the efficiency of mandibular molar protraction using Herbst appliances versus temporary anchorage devices: a retrospective case-controlled study.
Abstract: Mandibular second premolar agenesis is a common problem in orthodontics and is often treated in conjunction with maxillary counterbalancing extractions. However, in cases without maxillary crowding or dental protrusion, space closure may pose challenges leading to compromised occlusal results or patient profile. Multiple techniques have been described to treat these patients; nevertheless, there is a paucity of data comparing effectiveness of space closure utilizing various anchorage techniques. The goal of this study is to assess the effectiveness of the Herbst device during mandibular molar protraction and compare it to the use of temporary anchorage device (TADs) in patients with mandibular second premolar agenesis. This retrospective study included 33 patients with 

## Recommendations

- **Inspect confusion** between ClinicalTrial vs RCT
- Consider **merging ultra-rare labels** or reweighting if necessary
- **Per-label threshold tuning** can improve F1 for imbalanced classes

## 🧘 Reflection Log

**What did you learn in this session?**
- 

**What challenges did you encounter?**
- 

**How will this improve Periospot AI?**
- 
