In [1]:
import pandas as pd
import os
import sys
import numpy as np

# Resolve the project root (the parent of the directory the kernel started in)
# only once per kernel session. Recomputing it from os.getcwd() after the chdir
# below would climb one directory higher every time this cell is re-run.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Expose the project root to the import system, without stacking duplicate
# sys.path entries when the cell is executed again.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set the current working directory to the project root so that relative paths
# used later in the notebook resolve against it.
os.chdir(ROOT_DIR)

In [2]:
import torch


# Relative path of the checkpoint whose weights are loaded into the model later on.
BEST_MODEL_CHECKPOINT_PATH = 'models/phase0_xlmr_best_model.bin'

# Decision thresholds handed to the per-class F1 computation, one per label level.
SUBNARRATIVE_THRESHOLD = 0.80
NARRATIVE_THRESHOLD = 0.89

# Run on the GPU whenever torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
from src.scripts.data_preparation import prepare_dataframes

# Folder handed to prepare_dataframes (relative to the current working directory)
DATA_FOLDER = 'data'

# prepare_dataframes yields six items: the three dataset splits, the two label
# lookup tables, and the parent/child label pairs.
prepared = prepare_dataframes(data_folder=DATA_FOLDER)
train_df, val_df, test_df, id_to_label, label_to_id, parent_child_pairs = prepared

# Size of the label space, taken from the id -> label mapping
num_total_labels = len(id_to_label)

# Quick report of the split sizes and label count
print(f"Number of training examples: {len(train_df)}")
print(f"Number of validation examples: {len(val_df)}")
print(f"Number of testing examples: {len(test_df)}")
print(f"Number of labels: {num_total_labels}")

  from .autonotebook import tqdm as notebook_tqdm


Loading annotations and taxonomy...
Mapping labels to IDs and creating binarized vectors...
Splitting dataset into train, validation, and test sets...
Dataset split sizes: Train=1005, Validation=331, Test=363
Number of training examples: 1005
Number of validation examples: 331
Number of testing examples: 363
Number of labels: 117
Mapping labels to IDs and creating binarized vectors...
Splitting dataset into train, validation, and test sets...
Dataset split sizes: Train=1005, Validation=331, Test=363
Number of training examples: 1005
Number of validation examples: 331
Number of testing examples: 363
Number of labels: 117


In [4]:
# --- Step 1: Analyze Class Distribution on the Training Set ---
from src.utils.metrics import get_class_distribution


print("--- Analyzing Training Set Class Distribution ---")

# Per-label counts for the training split, computed by the shared helper
train_class_distribution = get_class_distribution(train_df, id_to_label)

# The first 20 rows are reported as the rarest classes, i.e. the primary
# candidates for augmentation.
rarest_classes = train_class_distribution.head(20)
print("\nTop 20 Rarest Classes in Training Data:")
print(rarest_classes)

--- Analyzing Training Set Class Distribution ---
Calculating class distribution...
Class distribution calculation complete.

Top 20 Rarest Classes in Training Data:
                                                 label  count          level
99   CC: Green policies are geopolitical instrument...      1  Sub-narrative
113  CC: Downplaying climate change: Sea levels are...      1  Sub-narrative
86   CC: Questioning the measurements and science: ...      1  Sub-narrative
68   CC: Green policies are geopolitical instrument...      1  Sub-narrative
74   CC: Downplaying climate change: Weather sugges...      2  Sub-narrative
95   CC: Downplaying climate change: CO2 concentrat...      2  Sub-narrative
111  URW: Blaming the war on others rather than the...      2  Sub-narrative
93   URW: Praise of Russia: Russian invasion has st...      2  Sub-narrative
112  URW: Distrust towards Media: Ukrainian media c...      2  Sub-narrative
104  CC: Questioning the measurements and science: ...      2  S

In [5]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from src.data_management.datasets import NarrativeClassificationDataset
from transformers import AutoModelForSequenceClassification
import torch

MODEL_NAME = 'xlm-roberta-base'
MAX_LENGTH = 512
BATCH_SIZE = 16

# This cell builds and loads the model; the dataset and dataloader are created
# in a later cell, so the progress message describes the model load.
print("Loading fine-tuned model from checkpoint...")

# Instantiate the base architecture with a multi-label classification head
# sized for our label set. The classifier weights are freshly initialised at
# this point (hence the transformers warning) and are overwritten by the
# checkpoint loaded below.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_total_labels,
    problem_type='multi_label_classification',
    id2label=id_to_label,
    label2id=label_to_id
)

# map_location remaps tensors that were saved on a GPU, so the checkpoint also
# loads on a CPU-only machine (where `device` is 'cpu').
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs and avoids executing arbitrary pickled code.
state_dict = torch.load(
    BEST_MODEL_CHECKPOINT_PATH,
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model.to(device)

# The model is only used for inference from here on: switch off dropout.
# eval() returns the module, so the cell still displays the architecture.
model.eval()

Creating validation dataset and dataloader...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [6]:

# Tokenizer belonging to the same base checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap the validation split in the project's dataset class.
# NOTE(review): max_length presumably caps the tokenized sequence length;
# confirm in NarrativeClassificationDataset.
val_dataset = NarrativeClassificationDataset(
    val_df,
    tokenizer,
    max_length=MAX_LENGTH,
)

# shuffle=False: batches are drawn in dataset order
val_dataloader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

print(f"Validation dataset created with {len(val_dataset)} examples.")
print(f"Validation dataloader created with batch size {BATCH_SIZE}.")

Validation dataset created with 331 examples.
Validation dataloader created with batch size 16.


In [7]:
# --- Step 2: Analyze Per-Class F1 Scores on the Validation Set ---
from src.training.engine import get_raw_predictions, compute_metrics


print("\n--- Analyzing Per-Class F1 Scores on Validation Set ---")

# Run the model over the validation dataloader. The helper returns a pair,
# unpacked here as the model outputs and the matching ground-truth labels.
raw_predictions = get_raw_predictions(model, val_dataloader, device)
val_logits, val_true_labels = raw_predictions


--- Analyzing Per-Class F1 Scores on Validation Set ---


Getting Raw Predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Getting Raw Predictions: 100%|██████████| 21/21 [00:05<00:00,  3.71it/s]
Getting Raw Predictions: 100%|██████████| 21/21 [00:05<00:00,  3.71it/s]


In [8]:

from src.utils.metrics import get_per_class_f1_scores

# --- Per-class F1 on the validation set, thresholded per label level ---
print("\n--- Calculating Per-Class F1 Scores with Best Thresholds ---")

# The label level is read off the label text: exactly one ':' marks a
# narrative, exactly two mark a sub-narrative.
# NOTE(review): labels with any other colon count (e.g. the bare "Other"
# label) end up in neither list -- confirm get_per_class_f1_scores treats
# those columns as intended.
colons_per_label = {label_id: text.count(':') for label_id, text in id_to_label.items()}
narrative_indices = [label_id for label_id, n_colons in colons_per_label.items() if n_colons == 1]
subnarrative_indices = [label_id for label_id, n_colons in colons_per_label.items() if n_colons == 2]

# Score every class, using the level-specific thresholds defined at the top
# of the notebook.
per_class_f1_df = get_per_class_f1_scores(
    true_labels=val_true_labels,
    pred_logits=val_logits,
    id_to_label_map=id_to_label,
    narrative_indices=narrative_indices,
    narrative_threshold=NARRATIVE_THRESHOLD,
    subnarrative_indices=subnarrative_indices,
    subnarrative_threshold=SUBNARRATIVE_THRESHOLD,
)

print("\nPer-Class F1 Scores (sorted by F1 score):")
# Lift pandas' row/column display limits for this one print so that no class
# is elided from the table.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(per_class_f1_df)



--- Calculating Per-Class F1 Scores with Best Thresholds ---
Calculating per-class F1 scores with per-level thresholds...
Probabilities Stats: Min=0.0255, Max=0.9744, Mean=0.3030
Narrative Threshold: 0.89, Sub-narrative Threshold: 0.8
Total number of positive predictions made (after thresholding): 2084
Per-class F1 score calculation complete.

Per-Class F1 Scores (sorted by F1 score):
                                                 label  f1_score
1                     CC: Climate change is beneficial  0.000000
2             CC: Controversy about green technologies  0.000000
6                       CC: Downplaying climate change  0.000000
7      CC: Green policies are geopolitical instruments  0.000000
10                                               Other  0.000000
15                         URW: Distrust towards Media  0.000000
9         CC: Questioning the measurements and science  0.000000
30   CC: Controversy about green technologies: Nucl...  0.000000
24   CC: Amplifying Climat

In [10]:
# --- Step 6: Identify Least Performing Narratives and Subnarratives (Bottom 30%) ---

# Separate narratives and subnarratives by label format: exactly one ':' is a
# narrative, exactly two is a sub-narrative. Labels with no colon (e.g. the
# bare "Other" label) fall in neither group.
colon_counts = per_class_f1_df['label'].str.count(':')
narratives_df = per_class_f1_df[colon_counts == 1]
subnarratives_df = per_class_f1_df[colon_counts == 2]

# Calculate bottom 30% count for each (always at least one row)
narr_bottom_n = max(1, int(len(narratives_df) * 0.3))
subnarr_bottom_n = max(1, int(len(subnarratives_df) * 0.3))

# keep='all' retains every class tied with the cutoff score. With the default
# keep='first', classes sharing the same F1 (e.g. many at 0.0) would be cut
# arbitrarily by row order. As a consequence the selection can contain more
# than 30% of the classes when ties occur at the cutoff.
least_perf_narratives = narratives_df.nsmallest(narr_bottom_n, 'f1_score', keep='all')
least_perf_subnarratives = subnarratives_df.nsmallest(subnarr_bottom_n, 'f1_score', keep='all')

print("\n--- Least Performing Narratives (Bottom 30%) ---")
print(least_perf_narratives[['label', 'f1_score']])

print("\n--- Least Performing Subnarratives (Bottom 30%) ---")
print(least_perf_subnarratives[['label', 'f1_score']])

# Save as CSV for easy inspection and re-use (written to the current working directory)
least_perf_narratives.to_csv('least_perf_narratives.csv', index=False)
least_perf_subnarratives.to_csv('least_perf_subnarratives.csv', index=False)



--- Least Performing Narratives (Bottom 30%) ---
                                              label  f1_score
1                  CC: Climate change is beneficial       0.0
2          CC: Controversy about green technologies       0.0
6                    CC: Downplaying climate change       0.0
7   CC: Green policies are geopolitical instruments       0.0
15                      URW: Distrust towards Media       0.0
9      CC: Questioning the measurements and science       0.0

--- Least Performing Subnarratives (Bottom 30%) ---
                                                label  f1_score
30  CC: Controversy about green technologies: Nucl...       0.0
24  CC: Amplifying Climate Fears: Earth will be un...       0.0
26  CC: Amplifying Climate Fears: Whatever we do i...       0.0
27  CC: Climate change is beneficial: CO2 is benef...       0.0
28            CC: Climate change is beneficial: Other       0.0
31    CC: Controversy about green technologies: Other       0.0
29  CC: Climate