In [None]:
import pandas as pd
import os
import sys
import numpy as np

# Resolve the project root (the parent of the directory the kernel started in) only once.
# Re-running this cell after the chdir below would otherwise recompute the root from the
# already-changed working directory and climb one more level on every execution, which
# breaks the relative 'data', 'devset' and 'models/...' paths used by later cells.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Make the project's packages importable without stacking duplicate entries on re-runs.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set the current working directory to the project root
os.chdir(ROOT_DIR)

In [None]:
import torch

# --- Backbone and checkpoint locations (relative to the project root) ---
MODEL_NAME = 'xlm-roberta-base'
PATH_TO_BEST_MODEL = 'models/phase0_xlmr_best_model.bin'
CONTINUAL_LEARNING_MODEL_PATH = 'models/phase0_xlmr_continual_learning_model.bin'

# --- Continual-learning schedule ---
CONTINUAL_LEARNING_EPOCHS = 5
CONTINUAL_LEARNING_LR = 2e-6
CONTINUAL_LEARNING_PATIENCE = 2  # epochs without validation-loss improvement before stopping

# --- Batching / tokenisation ---
BATCH_SIZE = 16
MAX_LENGTH = 512

# --- Loss weighting and decision threshold ---
# H_LAMBDA is forwarded to train_epoch/evaluate together with parent_child_pairs;
# presumably it weights a hierarchy-related loss term — confirm in src.training.engine.
H_LAMBDA = 1.5
# Threshold handed to evaluate() during training; a later cell re-estimates it on validation data.
optimal_threshold = 0.81

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
from src.scripts.data_preparation import prepare_datasets

print("--- Preparing original training data ---")

# First call: the original corpus. Everything returned here (tokenizer, label mappings,
# parent/child pairs, label count) is what the model and the later cells are built on.
(
    original_train_dataset,
    original_val_dataset,
    original_test_dataset,
    tokenizer,
    id_to_label,
    label_to_id,
    parent_child_pairs,
    num_total_labels,
) = prepare_datasets(
    data_folder='data',
    model_name=MODEL_NAME,
    docs_folder='raw-documents'
)

print("\nOriginal datasets created:")
print(f"  - Original Train set size: {len(original_train_dataset)}")
print(f"  - Original Validation set size: {len(original_val_dataset)}")
print(f"  - Original Test set size: {len(original_test_dataset)}")

print("--- Preparing incremental training data from devset ---")
# Second call: the devset. Only the three datasets are used downstream; the tokenizer,
# id_to_label and parent_child_pairs it returns are discarded. The label mapping and
# label count are kept just long enough to verify them against the originals below.
(
    inc_train_dataset,
    inc_val_dataset,
    inc_test_dataset,
    _,  # tokenizer - not reused
    _,  # id_to_label - not reused
    inc_label_to_id,
    _,  # parent_child_pairs - not reused
    inc_num_total_labels,
) = prepare_datasets(
    data_folder='devset',
    model_name=MODEL_NAME,
    docs_folder='subtask-2-documents'
)

# The incremental datasets are concatenated with the original training set and fed to a
# model whose output layer is sized by num_total_labels and indexed by label_to_id.
# If the devset produced a different label space, training would run without error on
# misaligned target columns, so fail loudly here instead of assuming they match.
if inc_num_total_labels != num_total_labels or inc_label_to_id != label_to_id:
    raise ValueError(
        "Label space of the incremental (devset) data does not match the original data: "
        f"{inc_num_total_labels} labels vs {num_total_labels}. "
        "The datasets cannot be combined safely."
    )

print("\nIncremental datasets created:")
print(f"  - Incremental Train set size: {len(inc_train_dataset)}")
print(f"  - Incremental Validation set size: {len(inc_val_dataset)}")
print(f"  - Incremental Test set size: {len(inc_test_dataset)}")

In [None]:
from torch.utils.data import ConcatDataset

# Training pool for continual learning: the original training split plus every split
# (train, validation and test) of the incremental data.
training_sources = [
    original_train_dataset,
    inc_train_dataset,
    inc_val_dataset,
    inc_test_dataset,
]
combined_train_dataset = ConcatDataset(training_sources)

# Size breakdown of what went into the pool.
print(f"--- Combined Training Dataset ---")
print(f"Original training set size: {len(original_train_dataset)}")
print(f"Incremental train set size: {len(inc_train_dataset)}")
print(f"Incremental validation set size: {len(inc_val_dataset)}")
print(f"Incremental test set size: {len(inc_test_dataset)}")
print(f"Total combined training set size: {len(combined_train_dataset)}")

# The original validation and test splits are left out of the pool so they can be
# used for model selection and the final evaluation.
print(f"\n--- Evaluation Datasets ---")
print(f"Original validation set size: {len(original_val_dataset)}")
print(f"Original test set size: {len(original_test_dataset)}")

In [None]:
import torch
from tqdm.auto import tqdm
from src.training.engine import train_epoch, evaluate
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader


# NOTE(review): the training-loop cell resets this value itself before reading it,
# so this assignment is redundant; kept so the cell's visible state is unchanged.
best_val_loss = float('inf')

# Build the multi-label classification model with the label space of the original data.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_total_labels,
    problem_type='multi_label_classification',
    id2label=id_to_label,
    label2id=label_to_id
)

# Start continual learning from the phase-0 checkpoint. map_location remaps the stored
# tensors onto the active device, so a checkpoint saved on a GPU still loads when the
# notebook falls back to 'cpu'.
model.load_state_dict(torch.load(PATH_TO_BEST_MODEL, map_location=device))
model.to(device)

In [None]:
# Options common to all three loaders; only the dataset and the shuffling differ.
_shared_loader_options = {
    'batch_size': BATCH_SIZE,
    'num_workers': 8,
    'pin_memory': True,
}

# Training batches are drawn in random order from the combined pool.
train_dataloader = DataLoader(combined_train_dataset, shuffle=True, **_shared_loader_options)

# Evaluation loaders keep a fixed order over the original validation / test splits.
val_dataloader = DataLoader(original_val_dataset, shuffle=False, **_shared_loader_options)
test_dataloader = DataLoader(original_test_dataset, shuffle=False, **_shared_loader_options)

In [None]:
from src.training.setup import setup_optimizer_and_scheduler


# Build the optimizer/scheduler pair for the continual-learning run.
# The training dataloader is passed alongside the epoch count; presumably the helper
# derives the total number of scheduler steps from them — confirm in src.training.setup.
optimizer, scheduler = setup_optimizer_and_scheduler(
    model=model,
    train_dataloader=train_dataloader,
    epochs=CONTINUAL_LEARNING_EPOCHS,
    learning_rate=CONTINUAL_LEARNING_LR,
)

In [None]:
import torch.nn as nn

# Multi-label objective: independent binary cross-entropy on the raw logits.
loss_function = nn.BCEWithLogitsLoss()

# Early-stopping bookkeeping: best validation loss seen so far and the number of
# consecutive epochs that failed to improve on it.
best_val_loss = float('inf')
patience_counter = 0

epoch_iterator = tqdm(range(CONTINUAL_LEARNING_EPOCHS), desc="Continual Learning Epochs")
for epoch in epoch_iterator:
    print(f"\nEpoch {epoch + 1}/{CONTINUAL_LEARNING_EPOCHS}")

    # One pass over the combined training pool.
    # NOTE(review): train_epoch receives (parent_child_pairs, H_LAMBDA) while evaluate
    # receives (H_LAMBDA, parent_child_pairs) — confirm both match their signatures.
    train_loss = train_epoch(
        model, train_dataloader, optimizer, scheduler,
        loss_function, device, parent_child_pairs, H_LAMBDA,
    )

    # Score the current weights on the original validation split.
    val_loss, metrics = evaluate(
        model, val_dataloader, loss_function, device,
        H_LAMBDA, parent_child_pairs,
        threshold=optimal_threshold,
    )

    print(f"Validation F1-score (micro): {metrics['f1_micro']:.4f}")

    # Improvement: checkpoint the weights and reset the patience counter.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), CONTINUAL_LEARNING_MODEL_PATH)
        print("Best model saved.")
        continue

    # No improvement: stop once the patience budget is used up.
    patience_counter += 1
    if patience_counter >= CONTINUAL_LEARNING_PATIENCE:
        print("Early stopping triggered.")
        break

In [None]:
import torch
import numpy as np

# --- Import your new and existing functions ---
# Your existing compute_metrics is inside engine.py
from src.training.engine import get_raw_predictions, compute_metrics
from src.utils.metrics import find_best_threshold

# --- Assumed objects are available ---
# model, val_dataloader, test_dataloader, device, parent_child_pairs
# Evaluate the checkpoint written by the continual-learning loop. The previous value,
# "models/phase0_xlmr_best_model.bin", is the phase-0 checkpoint the run started from,
# so the reported numbers did not reflect the continual-learning step at all.
MODEL_OUTPUT_PATH = CONTINUAL_LEARNING_MODEL_PATH

# 1. LOAD THE BEST MODEL WEIGHTS SAVED DURING TRAINING
print("\n--- Loading best model for threshold finding and final evaluation ---")
# map_location keeps the load working when the checkpoint was saved on a different device.
model.load_state_dict(torch.load(MODEL_OUTPUT_PATH, map_location=device))

model.to(device) # Make sure model is on the correct device

# 2. GET PREDICTIONS ON THE VALIDATION SET
# Raw logits and ground-truth labels for the validation split.
val_logits, val_true_labels = get_raw_predictions(model, val_dataloader, device)

# 3. FIND THE OPTIMAL THRESHOLD
# Search on validation data only, optimising micro-F1. This rebinds optimal_threshold,
# replacing the value set in the configuration cell.
optimal_threshold = find_best_threshold(
    val_logits,
    val_true_labels,
    parent_child_pairs,
    metric_to_optimize='f1_micro',
    compute_metrics_fn=compute_metrics # Pass your metrics function
)

# 4. FINAL EVALUATION ON THE UNSEEN TEST SET
print("\n--- Final Evaluation on TEST set using the Optimal Threshold ---")

# Get raw predictions for the test set
test_logits, test_true_labels = get_raw_predictions(model, test_dataloader, device)

# Score the test logits with the threshold selected on the validation split.
final_metrics = compute_metrics(
    test_logits,
    test_true_labels,
    parent_child_pairs,
    threshold=optimal_threshold
)

print(f"Final Reportable Performance on Test Set:")
print(f"  - F1 Micro: {final_metrics['f1_micro']:.4f}")
print(f"  - F1 Macro: {final_metrics['f1_macro']:.4f}")


# Finding multilevel thresholds

In [None]:
def _label_ids_with_colon_count(colon_count):
    """Return the label ids whose label string contains exactly `colon_count` ':' characters."""
    return [
        label_id
        for label_id in range(num_total_labels)
        if id_to_label[label_id].count(':') == colon_count
    ]


# Labels are split into two levels by the number of ':' separators in their name:
# one separator marks a narrative, two mark a sub-narrative.
narrative_indices = _label_ids_with_colon_count(1)
subnarrative_indices = _label_ids_with_colon_count(2)

print(f"Found {len(narrative_indices)} narrative-level labels.")
print(f"Found {len(subnarrative_indices)} sub-narrative-level labels.")

In [None]:
from src.utils.metrics import find_per_level_thresholds
from src.training.engine import get_raw_predictions


def _sigmoid(logits):
    """Convert raw logits to probabilities with the logistic function."""
    return 1 / (1 + np.exp(-logits))


# --- Threshold search on the validation split ---
print("Getting raw logits from validation set to find best thresholds...")
val_logits, val_true_labels = get_raw_predictions(model, val_dataloader, device)

print("Converting logits to probabilities...")
val_probabilities = _sigmoid(val_logits)

# One threshold per label level, returned in a dict keyed by level.
threshold_results = find_per_level_thresholds(
    val_probabilities,
    val_true_labels,
    narrative_indices,
    subnarrative_indices,
    parent_child_pairs
)
optimal_narr_thresh = threshold_results['narrative_threshold']
optimal_subnarr_thresh = threshold_results['subnarrative_threshold']

# --- Apply the selected thresholds to the test split ---
print("\n--- Final Evaluation on TEST set using Per-Level Thresholds ---")

test_logits, test_true_labels = get_raw_predictions(model, test_dataloader, device)
test_probabilities = _sigmoid(test_logits)

# Start from all-zero predictions and fill in each level's columns with its own threshold.
# NOTE(review): columns belonging to neither index list (labels without exactly one or
# two ':' separators) stay 0, i.e. are never predicted — confirm that is intended.
final_test_preds = np.zeros_like(test_probabilities, dtype=int)
for level_indices, level_threshold in (
    (narrative_indices, optimal_narr_thresh),
    (subnarrative_indices, optimal_subnarr_thresh),
):
    final_test_preds[:, level_indices] = (
        test_probabilities[:, level_indices] > level_threshold
    ).astype(int)



In [None]:
from src.training.engine import compute_metrics

# Score the per-level-thresholded test predictions against the true test labels.
# threshold=None is passed because final_test_preds is already binary; presumably
# compute_metrics then skips its own thresholding — confirm in src.training.engine.
final_per_level_metrics = compute_metrics(
    final_test_preds,
    test_true_labels,
    parent_child_pairs,
    threshold=None,
)

per_level_f1_micro = final_per_level_metrics['f1_micro']
per_level_f1_macro = final_per_level_metrics['f1_macro']

print("Test set results using per-level thresholds:")
print(f"  - F1 Micro: {per_level_f1_micro:.4f}")
print(f"  - F1 Macro: {per_level_f1_macro:.4f}")