In [None]:
import pandas as pd
import os
import sys
import numpy as np

# Resolve the project root (the parent of the directory the notebook starts in) only once.
# After the first run the working directory already *is* the root, so recomputing
# "cwd/.." on a re-run would climb one directory too high each time.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Make the project packages importable without stacking duplicate entries on re-runs.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set the current working directory to the project root
os.chdir(ROOT_DIR)

In [None]:
from src.data_management.loaders import load_labeled_df

# Read the labelled phase-0 baseline frame through the project loader and preview it.
labeled_parquet_name = 'phase0_baseline_labeled.parquet'
df = load_labeled_df(labeled_parquet_name)
df.head()

# Splitting the dataset into training, validation, and test sets

In [None]:
from skmultilearn.model_selection import iterative_train_test_split

# Seed NumPy's global RNG before stratifying so the split is the same on every kernel run.
# Without this, validation/test membership changes between runs, and a checkpoint trained
# on one split would later be thresholded and tested on rows it may have been trained on.
# NOTE(review): iterative_train_test_split takes no seed argument; this assumes it draws
# from NumPy's global RNG -- confirm against the installed scikit-multilearn version.
# A checkpoint trained before this seed was introduced does not correspond to this split.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# The stratifier only needs row identifiers, so X is the frame's index as an (n, 1) column;
# y is the stacked per-row label vectors from the 'labels' column.
X = df.index.to_numpy().reshape(-1, 1)
y = np.array(df['labels'].tolist())

# First cut: hold out 20% of the rows as the test set.
train_val_indices, y_train_val, test_indices, y_test = iterative_train_test_split(X, y, test_size = 0.2)

# Second cut: 25% of the remaining 80% becomes validation (about 60/20/20 overall).
train_indices, y_train, val_indices, y_val = iterative_train_test_split(train_val_indices, y_train_val, test_size = 0.25)

# Map the selected index labels back to full rows of the original frame.
train_df = df.loc[train_indices.flatten()]
val_df = df.loc[val_indices.flatten()]
test_df = df.loc[test_indices.flatten()]


# Verify the results
print("Original dataset shape:", df.shape)
print("Train set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

print("\nExample of train_df head:")
print(train_df.head())

# Tokenizing the dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier; the tokenizer here and the classifier built in a later cell
# are both loaded from it.
MODEL_NAME = 'xlm-roberta-base'

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
)


In [None]:
# The width of the classifier head is the length of one row's label vector;
# the first row is used as the reference.
first_label_vector = df['labels'].iloc[0]
num_total_labels = first_label_vector.shape[0]
print(f"Number of total labels: {num_total_labels}")

In [None]:
from src.data_management.label_parser import get_label_mappings

label_to_id, id_to_label, narrative_to_subnarrative_ids = get_label_mappings()

# Invert the narrative -> [sub-narrative ids] listing into sub-narrative id -> narrative id.
# If a sub-narrative id were listed under several narratives, the last one seen wins.
sub_to_narr_id_map = {
    child_id: parent_id
    for parent_id, child_ids in narrative_to_subnarrative_ids.items()
    for child_id in child_ids
}

# Flat list of the same relation. Each tuple is (sub-narrative id, narrative id),
# i.e. child first, parent second, despite the variable's name.
parent_child_pairs = list(sub_to_narr_id_map.items())

In [None]:
# Head configuration for the multi-label classifier: one output per label, plus the
# id <-> label maps so they are stored on the model config.
classifier_options = dict(
    num_labels=num_total_labels,
    problem_type='multi_label_classification',
    id2label=id_to_label,
    label2id=label_to_id,
)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_options)

print("Model and tokenizer loaded successfully.")

In [None]:
from src.data_management.datasets import NarrativeClassificationDataset

BATCH_SIZE = 16
MAX_LENGTH = 512


def _build_dataset(frame):
    """Wrap one split's DataFrame in a NarrativeClassificationDataset using the shared tokenizer and MAX_LENGTH."""
    return NarrativeClassificationDataset(frame, tokenizer, max_length=MAX_LENGTH)


print("Creating PyTorch datasets...")
# Same construction order as before: train, then test, then validation.
train_dataset = _build_dataset(train_df)
test_dataset = _build_dataset(test_df)
val_dataset = _build_dataset(val_df)
print("PyTorch datasets created successfully.")

In [None]:
from torch.utils.data import DataLoader
print("Creating DataLoaders...")

# Settings shared by all three loaders; only the shuffle flag differs per split.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=8, pin_memory=True)

# Only the training loader shuffles; evaluation loaders keep dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_options)

print("DataLoaders created successfully.")

# Smoke test: pull a single batch and print the tensor shapes.
# Expected: input_ids -> [BATCH_SIZE, MAX_TOKEN_LEN], labels -> [BATCH_SIZE, num_total_labels]
for batch in train_dataloader:
    for field in ('input_ids', 'labels'):
        print(batch[field].shape)
    break

In [None]:
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Place the model on the chosen device (the cell's last expression also displays the model).
model.to(device)

In [None]:
from torch.optim import AdamW

# AdamW over every parameter of the current `model` object, with a fixed 2e-5 learning rate.
trainable_parameters = model.parameters()
optimizer = AdamW(trainable_parameters, lr=2e-5)
print("Optimizer created successfully.")

In [None]:
from transformers.optimization import get_linear_schedule_with_warmup

EPOCH = 10

# Total step budget for the schedule: one step per training batch, for EPOCH epochs.
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * EPOCH

# Linear schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print("Scheduler created successfully.")

In [None]:
# Training configuration for the phase-0 run. The loop itself is disabled below, but
# later cells still use loss_function and H_LAMBDA when they call evaluate()/train_epoch(),
# so this cell has to be executed.
from src.training.engine import train_epoch, evaluate
import torch.nn as nn
from tqdm.auto import tqdm
import torch

# Binary cross-entropy on raw logits.
loss_function = nn.BCEWithLogitsLoss()
# Passed to train_epoch/evaluate alongside parent_child_pairs; presumably the weight of
# a hierarchy-consistency term -- confirm in src.training.engine.
H_LAMBDA = 1.5
# Early-stopping bookkeeping read by the loop below.
best_val_loss = float('inf')
patience = 3
patience_counter = 0


# NOTE(review): the phase-0 training loop is kept commented out for reference.
# As written it would save the checkpoint to f'phase0_{MODEL_NAME}_best_model.bin',
# while the evaluation cells load 'models/phase0_xlmr_best_model.bin'. Reconcile the
# two paths before re-enabling this loop.
# for epoch in tqdm(range(EPOCH), desc="Epochs"):
#     print(f"Epoch {epoch+1}/{EPOCH}")
#
#     train_loss = train_epoch(
#         model,
#         train_dataloader,
#         optimizer,
#         scheduler,
#         loss_function,
#         device,
#         parent_child_pairs,
#         H_LAMBDA
#     )
    
#     val_loss, metrics = evaluate(
#         model,
#         val_dataloader,
#         loss_function,
#         device,
#         H_LAMBDA,
#         parent_child_pairs,
#         threshold=0.5 # Using a default threshold for validation during training
#     )
    
#     print(f"Validation F1-score (micro): {metrics['f1_micro']:.4f}")

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         patience_counter = 0
#         # Save the best model
#         torch.save(model.state_dict(), f'phase0_{MODEL_NAME}_best_model.bin')
#         print("Best model saved.")
#     else:
#         patience_counter += 1
#         if patience_counter >= patience:
#             print("Early stopping triggered.")
#             break


In [None]:
import torch
import numpy as np

# get_raw_predictions / compute_metrics come from the training engine;
# find_best_threshold comes from the metrics utilities.
from src.training.engine import get_raw_predictions, compute_metrics
from src.utils.metrics import find_best_threshold

# Objects expected from earlier cells:
# model, val_dataloader, test_dataloader, device, parent_child_pairs
MODEL_OUTPUT_PATH = "models/phase0_xlmr_best_model.bin"

# 1. LOAD THE BEST MODEL WEIGHTS SAVED DURING TRAINING
print("\n--- Loading best model for threshold finding and final evaluation ---")
# map_location remaps the stored tensors onto `device`, so a checkpoint written from a
# CUDA run still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load(MODEL_OUTPUT_PATH, map_location=device))

model.to(device) # Make sure model is on the correct device

# 2. GET PREDICTIONS ON THE VALIDATION SET
val_logits, val_true_labels = get_raw_predictions(model, val_dataloader, device)

# 3. FIND THE OPTIMAL THRESHOLD
# The threshold is tuned on validation data only, so the test set stays unseen.
optimal_threshold = find_best_threshold(
    val_logits,
    val_true_labels,
    parent_child_pairs,
    metric_to_optimize='f1_micro',
    compute_metrics_fn=compute_metrics # Metrics function used to score each candidate
)

# 4. FINAL EVALUATION ON THE UNSEEN TEST SET
print("\n--- Final Evaluation on TEST set using the Optimal Threshold ---")

# Get raw predictions for the test set
test_logits, test_true_labels = get_raw_predictions(model, test_dataloader, device)

# Score the test predictions with the threshold chosen on the validation set
final_metrics = compute_metrics(
    test_logits,
    test_true_labels,
    parent_child_pairs,
    threshold=optimal_threshold
)

print("Final Reportable Performance on Test Set:")
print(f"  - F1 Micro: {final_metrics['f1_micro']:.4f}")
print(f"  - F1 Macro: {final_metrics['f1_macro']:.4f}")



In [None]:
# Evaluate on the test set with the optimal threshold
print("Evaluating on the test set with the optimal threshold...")
# Reload the best checkpoint before evaluating. The path constant from the threshold
# cell is reused so both cells always read the same file, and map_location lets a
# CUDA-saved checkpoint load when `device` is the CPU.
model.load_state_dict(torch.load(MODEL_OUTPUT_PATH, map_location=device))
test_loss, test_metrics = evaluate(
    model,
    test_dataloader,
    loss_function,
    device,
    H_LAMBDA,
    parent_child_pairs,
    threshold=optimal_threshold
)

print("\nTest Set Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value:.4f}")

# New data from the devset

In [None]:
# Base checkpoint identifier, redeclared here with the same value as earlier in the notebook.
MODEL_NAME = 'xlm-roberta-base'

# Checkpoints: the phase-0 weights to start from, and where the continual-learning
# weights are written.
PATH_TO_BEST_MODEL = 'models/phase0_xlmr_best_model.bin'
CONTINUAL_LEARNING_MODEL_PATH = 'models/phase0_xlmr_continual_learning_model.bin'

# Continual-learning hyper-parameters.
CONTINUAL_LEARNING_EPOCHS = 5
CONTINUAL_LEARNING_PATIENCE = 2
CONTINUAL_LEARNING_LR = 2e-6

In [None]:
from src.scripts.data_preparation import prepare_datasets

print("--- Preparing original training data ---")

# prepare_datasets returns an 8-item bundle. For the original corpus every item is kept,
# which rebinds tokenizer, the label maps, parent_child_pairs and num_total_labels.
original_bundle = prepare_datasets(
    data_folder='data',
    model_name=MODEL_NAME,
    docs_folder='raw-documents'
)
(
    original_train_dataset,
    original_val_dataset,
    original_test_dataset,
    tokenizer,
    id_to_label,
    label_to_id,
    parent_child_pairs,
    num_total_labels,
) = original_bundle

print("\nOriginal datasets created:")
for split_caption, split_dataset in (
    ("Original Train", original_train_dataset),
    ("Original Validation", original_val_dataset),
    ("Original Test", original_test_dataset),
):
    print(f"  - {split_caption} set size: {len(split_dataset)}")

print("--- Preparing incremental training data from devset ---")
# The devset goes through the same pipeline with the same model name. Only its three
# datasets are kept; the tokenizer, label maps, parent/child pairs and label count it
# returns are assumed to match the originals and are discarded.
incremental_bundle = prepare_datasets(
    data_folder='devset',
    model_name=MODEL_NAME,
    docs_folder='subtask-2-documents'
)
inc_train_dataset, inc_val_dataset, inc_test_dataset, _, _, _, _, _ = incremental_bundle

print("\nIncremental datasets created:")
for split_caption, split_dataset in (
    ("Incremental Train", inc_train_dataset),
    ("Incremental Validation", inc_val_dataset),
    ("Incremental Test", inc_test_dataset),
):
    print(f"  - {split_caption} set size: {len(split_dataset)}")

# Incremental training would typically draw on `inc_train_dataset`, optionally merged
# with the original training set, with `inc_val_dataset` available for monitoring
# performance during continual learning.

In [None]:
from torch.utils.data import ConcatDataset

# Parts that make up the continual-learning training set: the original training split
# plus every split of the incremental data. Captions are reused for the size report.
training_parts = [
    ("Original training set size", original_train_dataset),
    ("Incremental train set size", inc_train_dataset),
    ("Incremental validation set size", inc_val_dataset),
    ("Incremental test set size", inc_test_dataset),
]
combined_train_dataset = ConcatDataset([part for _caption, part in training_parts])

print("--- Combined Training Dataset ---")
for caption, part in training_parts:
    print(f"{caption}: {len(part)}")
print(f"Total combined training set size: {len(combined_train_dataset)}")

# The original validation and test splits are not merged in; they stay available
# for evaluation.
held_out_parts = [
    ("Original validation set size", original_val_dataset),
    ("Original test set size", original_test_dataset),
]
print("\n--- Evaluation Datasets ---")
for caption, part in held_out_parts:
    print(f"{caption}: {len(part)}")

In [None]:
from tqdm.auto import tqdm
from src.training.engine import train_epoch, evaluate
from transformers import AutoModelForSequenceClassification
from transformers.optimization import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

# Continual learning: start from the best phase-0 checkpoint and fine-tune on the
# combined (original + incremental) training data, validating on the original
# validation split with early stopping.

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_total_labels,
    problem_type='multi_label_classification',
    id2label=id_to_label,
    label2id=label_to_id
)
# map_location lets a CUDA-saved checkpoint load on whichever device is active.
model.load_state_dict(torch.load(PATH_TO_BEST_MODEL, map_location=device))
# This is a brand-new model object, so it has to be moved to the device explicitly.
model.to(device)

# Build the loaders once instead of re-creating them (and their workers) every epoch.
continual_train_dataloader = DataLoader(
    combined_train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=8, pin_memory=True
)
continual_val_dataloader = DataLoader(
    original_val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8, pin_memory=True
)

# The optimizer and scheduler created earlier in the notebook were built from the
# previous `model` object's parameters. Stepping them would never update the model
# instantiated above, and CONTINUAL_LEARNING_LR would go unused. Create fresh ones
# bound to this model.
continual_optimizer = AdamW(model.parameters(), lr=CONTINUAL_LEARNING_LR)
# Step budget follows the notebook's earlier convention (batches per epoch * epochs);
# this assumes train_epoch steps the scheduler once per batch -- confirm in src.training.engine.
continual_scheduler = get_linear_schedule_with_warmup(
    continual_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(continual_train_dataloader) * CONTINUAL_LEARNING_EPOCHS
)

# Early-stopping state is initialised here rather than inherited from an earlier cell.
best_val_loss = float('inf')
patience_counter = 0

for epoch in tqdm(range(CONTINUAL_LEARNING_EPOCHS), desc="Continual Learning Epochs"):
    print(f"\nEpoch {epoch + 1}/{CONTINUAL_LEARNING_EPOCHS}")

    train_loss = train_epoch(
        model,
        continual_train_dataloader,
        continual_optimizer,
        continual_scheduler,
        loss_function,
        device,
        parent_child_pairs,
        H_LAMBDA
    )

    val_loss, metrics = evaluate(
        model,
        continual_val_dataloader,
        loss_function,
        device,
        H_LAMBDA,
        parent_child_pairs,
        threshold=optimal_threshold
    )

    print(f"Validation F1-score (micro): {metrics['f1_micro']:.4f}")

    if val_loss < best_val_loss:
        # New best validation loss: checkpoint the weights and reset the patience budget.
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), CONTINUAL_LEARNING_MODEL_PATH)
        print("Best model saved.")
    else:
        patience_counter += 1
        if patience_counter >= CONTINUAL_LEARNING_PATIENCE:
            print("Early stopping triggered.")
            break