In [None]:
import pandas as pd
import os
import sys
import numpy as np

# Resolve the project root (the parent of the directory the notebook starts in)
# only once per kernel. Recomputing it after the chdir below would move the
# working directory up one more level every time this cell is re-run.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Make project packages (e.g. `src`) importable, without stacking duplicate
# entries on sys.path when the cell is executed again.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set the current working directory to the project root
os.chdir(ROOT_DIR)

In [None]:
import torch


# Run on the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint file whose weights are loaded into the model further down.
BEST_MODEL_CHECKPOINT_PATH = 'models/phase0_xlmr_best_model.bin'

# Thresholds handed to get_per_class_f1_scores for narrative and
# sub-narrative labels respectively.
NARRATIVE_THRESHOLD = 0.89
SUBNARRATIVE_THRESHOLD = 0.80

In [None]:
from src.scripts.data_preparation import prepare_dataframes

# Folder name passed to prepare_dataframes as `data_folder`.
DATA_FOLDER = 'data'

# prepare_dataframes returns six values, unpacked in order: the three data
# splits, the two label mappings, and the parent/child pairs.
train_df, val_df, test_df, id_to_label, label_to_id, parent_child_pairs = (
    prepare_dataframes(data_folder=DATA_FOLDER)
)

# The number of labels is the size of the id -> label mapping.
num_total_labels = len(id_to_label)

# One summary line per split, followed by the label count.
print(
    f"Number of training examples: {len(train_df)}",
    f"Number of validation examples: {len(val_df)}",
    f"Number of testing examples: {len(test_df)}",
    f"Number of labels: {num_total_labels}",
    sep="\n",
)

In [None]:
# --- Step 1: Analyze Class Distribution on the Training Set ---
from src.utils.metrics import get_class_distribution


print("--- Analyzing Training Set Class Distribution ---")

# Class distribution of the training split, computed by the shared helper.
train_class_distribution = get_class_distribution(train_df, id_to_label)

# Show the first 20 rows as augmentation candidates.
# NOTE(review): these are the *rarest* classes only if get_class_distribution
# returns its result sorted ascending by count — confirm against the helper.
print(
    "\nTop 20 Rarest Classes in Training Data:",
    train_class_distribution.head(20),
    sep="\n",
)

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from src.data_management.datasets import NarrativeClassificationDataset
from transformers import AutoModelForSequenceClassification
import torch

# Base architecture, plus the tokenisation / batching settings used by the
# dataset and dataloader cell that follows.
MODEL_NAME = 'xlm-roberta-base'
MAX_LENGTH = 512
BATCH_SIZE = 16

# NOTE(review): this message describes the *next* cell (dataset/dataloader);
# this cell builds the model and loads the checkpoint. The string is left
# unchanged so the notebook output stays the same.
print("Creating validation dataset and dataloader...")

# Build the multi-label classification model with one output per label and
# the project's id <-> label mappings attached to its config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_total_labels,
    problem_type='multi_label_classification',
    id2label=id_to_label,
    label2id=label_to_id
)

# Load the fine-tuned weights onto the CPU first. Without map_location, a
# checkpoint saved from a GPU fails to load on a CPU-only machine, even though
# `device` falls back to 'cpu'. The model is moved to the target device below.
# SECURITY: torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch
# version supports it).
model.load_state_dict(torch.load(BEST_MODEL_CHECKPOINT_PATH, map_location='cpu'))
model.to(device)

In [None]:

# Tokenizer that matches the pretrained model name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap the validation split in the project's dataset class.
val_dataset = NarrativeClassificationDataset(val_df, tokenizer, max_length=MAX_LENGTH)

# Fixed batch order (no shuffling) for the validation dataloader.
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

print(
    f"Validation dataset created with {len(val_dataset)} examples.",
    f"Validation dataloader created with batch size {BATCH_SIZE}.",
    sep="\n",
)

In [None]:
# --- Step 2: Collect Raw Model Outputs on the Validation Set ---
# NOTE(review): compute_metrics is imported here but not used in the visible cells.
from src.training.engine import get_raw_predictions, compute_metrics


print("\n--- Analyzing Per-Class F1 Scores on Validation Set ---")
# Run the model over the validation dataloader. The helper's two return values
# are kept as the logits and the true labels, which the per-class F1 cell
# below passes to get_per_class_f1_scores.
# NOTE(review): presumably these are un-thresholded outputs with one column per
# label — confirm against get_raw_predictions.
val_logits, val_true_labels = get_raw_predictions(model, val_dataloader, device)

In [None]:

from src.utils.metrics import get_per_class_f1_scores

# --- Step 5: Per-Class F1 Scores at the Configured Thresholds ---
print("\n--- Calculating Per-Class F1 Scores with Best Thresholds ---")

# Split label ids by label format: exactly one ':' in the label name marks a
# narrative, exactly two mark a sub-narrative.
narrative_indices = [
    idx for idx, name in id_to_label.items() if name.count(':') == 1
]
subnarrative_indices = [
    idx for idx, name in id_to_label.items() if name.count(':') == 2
]

# Score every class with the shared helper. The two thresholds are the
# constants defined near the top of the notebook.
per_class_f1_df = get_per_class_f1_scores(
    pred_logits=val_logits,
    true_labels=val_true_labels,
    id_to_label_map=id_to_label,
    narrative_indices=narrative_indices,
    narrative_threshold=NARRATIVE_THRESHOLD,
    subnarrative_indices=subnarrative_indices,
    subnarrative_threshold=SUBNARRATIVE_THRESHOLD,
)

print("\nPer-Class F1 Scores (sorted by F1 score):")
# Temporarily lift pandas' row/column display limits so every class is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(per_class_f1_df)


In [None]:
# --- Step 6: Identify Least Performing Narratives and Subnarratives (Bottom 30%) ---

# Share of each label group reported as "least performing". Defined once so
# the selection sizes and the printed headings cannot drift apart.
BOTTOM_FRACTION = 0.3

# Separate narratives and subnarratives by label format (one ':' vs. two).
# The vectorised string accessor replaces a row-wise lambda and always yields
# a boolean mask, including when the frame is empty.
label_colon_counts = per_class_f1_df['label'].str.count(':')
narratives_df = per_class_f1_df[label_colon_counts == 1]
subnarratives_df = per_class_f1_df[label_colon_counts == 2]

# Number of rows in the bottom fraction of each group, never fewer than one.
narr_bottom_n = max(1, int(len(narratives_df) * BOTTOM_FRACTION))
subnarr_bottom_n = max(1, int(len(subnarratives_df) * BOTTOM_FRACTION))

# Rows with the lowest F1 scores in each group.
least_perf_narratives = narratives_df.nsmallest(narr_bottom_n, 'f1_score')
least_perf_subnarratives = subnarratives_df.nsmallest(subnarr_bottom_n, 'f1_score')

print(f"\n--- Least Performing Narratives (Bottom {BOTTOM_FRACTION:.0%}) ---")
print(least_perf_narratives[['label', 'f1_score']])

print(f"\n--- Least Performing Subnarratives (Bottom {BOTTOM_FRACTION:.0%}) ---")
print(least_perf_subnarratives[['label', 'f1_score']])

# Save as CSV (in the current working directory) for easy inspection and re-use
least_perf_narratives.to_csv('least_perf_narratives.csv', index=False)
least_perf_subnarratives.to_csv('least_perf_subnarratives.csv', index=False)
