In [1]:
import pandas as pd
import os
import sys
import numpy as np

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# Set the current working directory to the project root
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
os.chdir(ROOT_DIR)

In [2]:
from src.data_management.loaders import load_labeled_df

df = load_labeled_df('phase0_baseline_labeled.parquet')
df.head()

Unnamed: 0,id,text,narratives,subnarratives,language,narrative_ids,subnarrative_ids,num_narratives,num_subnarratives,word_count,word_count_bin,labels
0,BG_670.txt,Опитът на колективния Запад да „обезкърви Руси...,[URW: Blaming the war on others rather than th...,[URW: Blaming the war on others rather than th...,BG,"[11, 12, 14]","[88, 70, 74, 86]",4,4,248,101-250,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, ..."
1,A7_URW_BG_4793.txt,"Цончо Ганев, “Възраждане”: Обещали сме на Укра...",[URW: Discrediting Ukraine],[URW: Discrediting Ukraine: Situation in Ukrai...,BG,[13],[81],1,1,503,501-1000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
2,BG_3245.txt,Подкрепата за Киев от страна на Запада вече не...,"[URW: Discrediting the West, Diplomacy, URW: D...","[URW: Discrediting the West, Diplomacy: The We...",BG,"[13, 14]","[81, 90, 87]",3,3,190,101-250,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
3,A9_BG_5190.txt,"Дмитрий Медведев: НПО-та, спонсорирани от Соро...","[URW: Discrediting the West, Diplomacy, URW: D...","[URW: Discrediting the West, Diplomacy: Other,...",BG,"[13, 14]","[84, 86]",2,2,275,251-500,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
4,A9_BG_3379.txt,Британски дипломат обвини Запада за украинския...,"[URW: Discrediting the West, Diplomacy, URW: P...","[URW: Discrediting the West, Diplomacy: Other,...",BG,"[19, 14]","[86, 103]",2,2,237,101-250,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


# Splitting the dataset into training and testing sets

In [3]:
from skmultilearn.model_selection import iterative_train_test_split

# Split the dataset into training and testing sets
X = df.index.to_numpy().reshape(-1, 1)
y = np.array(df['labels'].tolist())

train_val_indices, y_train_val, test_indices, y_test = iterative_train_test_split(X, y, test_size = 0.2)


train_indices, y_train, val_indices, y_val = iterative_train_test_split(train_val_indices, y_train_val, test_size = 0.25)

train_df = df.loc[train_indices.flatten()]
val_df = df.loc[val_indices.flatten()]
test_df = df.loc[test_indices.flatten()]


# 5. Verify the results
print("Original dataset shape:", df.shape)
print("Train set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

print("\nExample of train_df head:")
print(train_df.head())

Original dataset shape: (1699, 12)
Train set shape: (992, 12)
Validation set shape: (337, 12)
Test set shape: (370, 12)

Example of train_df head:
                    id                                               text  \
1   A7_URW_BG_4793.txt  Цончо Ганев, “Възраждане”: Обещали сме на Укра...   
5   A7_URW_BG_3566.txt  Ответните мерки ще бъдат крайно болезнени за Е...   
6           BG_855.txt  Русия забрани разпространението на десетки мед...   
7           BG_751.txt  US военен: Путин ни изигра така, както Рейгън ...   
13          BG_573.txt  На 19 и 21 юни киевският режим извърши поредни...   

                                           narratives  \
1                         [URW: Discrediting Ukraine]   
5   [URW: Discrediting Ukraine, URW: Negative Cons...   
6   [URW: Distrust towards Media, URW: Distrust to...   
7   [URW: Praise of Russia, URW: Discrediting Ukra...   
13  [URW: Blaming the war on others rather than th...   

                                        subnarr

# Tokenizing the dataset

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

num_total_labels = df['labels'].iloc[0].shape[0]
print(f"Number of total labels: {num_total_labels}")

  from .autonotebook import tqdm as notebook_tqdm


Number of total labels: 117


In [5]:
from src.data_management.label_parser import get_label_mappings

label_to_id, id_to_label, narrative_to_subnarrative_ids = get_label_mappings()

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = num_total_labels,
    problem_type = 'multi_label_classification',
    id2label = id_to_label,
    label2id = label_to_id 
)

print("Model and tokenizer loaded successfully.")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded successfully.


In [7]:
from src.data_management.datasets import NarrativeClassificationDataset

BATCH_SIZE = 16
MAX_LENGTH = 512

print("Creating PyTorch datasets...")
train_dataset = NarrativeClassificationDataset(
    train_df,
    tokenizer,
    max_length = MAX_LENGTH,
)

test_dataset = NarrativeClassificationDataset(
    test_df,
    tokenizer,
    max_length = MAX_LENGTH,
)


Creating PyTorch datasets...


In [8]:
from torch.utils.data import DataLoader
print("Creating DataLoaders...")

train_dataloader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 8,
    pin_memory = True,
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = 8,
    pin_memory = True,
)

print("DataLoaders created successfully.")
for batch in train_dataloader:
    print(batch['input_ids'].shape) # Should be [BATCH_SIZE, MAX_TOKEN_LEN]
    print(batch['labels'].shape)    # Should be [BATCH_SIZE, num_total_labels]
    break

Creating DataLoaders...
DataLoaders created successfully.
torch.Size([16, 512])
torch.Size([16, 117])


In [9]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move your model to the selected device
model.to(device)

Using device: cuda


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [10]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)
print("Optimizer created successfully.")

Optimizer created successfully.


In [15]:
from transformers.optimization import get_linear_schedule_with_warmup

EPOCH = 3
num_training_steps = len(train_dataloader) * EPOCH

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

print("Scheduler created successfully.")

Scheduler created successfully.


In [16]:
from sklearn.metrics import f1_score, roc_auc_score


MODEL_OUTPUT_PATH = 'models/phase0_baseline_model.pt'
if not os.path.exists('models'):
    os.makedirs('models')
    
def compute_f1_metrics(preds, labels, threshold=0.5):
    # preds are the raw logits, labels are the multi-hot encoded true values
    sigmoid_preds = 1 / (1 + np.exp(-preds)) # Apply sigmoid to convert logits to probabilities
    binary_preds = (sigmoid_preds > threshold).astype(int) # Apply threshold to get binary predictions
    
    f1_micro = f1_score(y_true=labels, y_pred=binary_preds, average='micro', zero_division=0)
    f1_macro = f1_score(y_true=labels, y_pred=binary_preds, average='macro', zero_division=0)
    
    roc_auc_micro = roc_auc_score(y_true=labels, y_score=sigmoid_preds, average='micro')
    roc_auc_macro = roc_auc_score(y_true=labels, y_score=sigmoid_preds, average='macro')

    return {"f1_micro": f1_micro, "f1_macro": f1_macro, "roc_auc_micro": roc_auc_micro, "roc_auc_macro": roc_auc_macro}

In [17]:
from tqdm.auto import tqdm

print("Starting the training process...")
best_f1_score = 0.0

for epoch in range(EPOCH):
    model.train()
    total_train_loss = 0.0
    
    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        model.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    
    # --- Evaluation Phase ---
    print("Running evaluation on the test set...")
    model.eval() # Set the model to evaluation mode
    
    all_preds_logits = []
    all_true_labels = []
    
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            # Move batch data to the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass to get logits
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Move logits and labels to CPU and convert to numpy arrays to accumulate them
            all_preds_logits.append(logits.cpu().numpy())
            all_true_labels.append(labels.cpu().numpy())

    # Concatenate all logits and labels
    all_preds_logits = np.concatenate(all_preds_logits, axis=0)
    all_true_labels = np.concatenate(all_true_labels, axis=0)
    sigmoid_preds = 1 / (1 + np.exp(-all_preds_logits))
    print(f"Sigmoid Preds Stats: Min={np.min(sigmoid_preds):.4f}, Max={np.max(sigmoid_preds):.4f}, Mean={np.mean(sigmoid_preds):.4f}")
    # Compute F1 metrics
    eval_metrics = compute_f1_metrics(all_preds_logits, all_true_labels)
    current_f1 = eval_metrics['f1_micro']
    if current_f1 > best_f1_score:
        best_f1_score = current_f1
        print(f"New best F1 score ({best_f1_score:.4f})! Saving model to {MODEL_OUTPUT_PATH}")
        torch.save(model.state_dict(), MODEL_OUTPUT_PATH)
        
print("\n--- Training Complete ---")
print(f"Best F1 Micro score achieved: {best_f1_score:.4f}")

Starting the training process...


Training: 100%|██████████| 62/62 [04:08<00:00,  4.01s/it]


Average Training Loss: 0.1304
Running evaluation on the test set...


Evaluating: 100%|██████████| 24/24 [00:58<00:00,  2.45s/it]


Sigmoid Preds Stats: Min=0.0120, Max=0.2251, Mean=0.0404


Training: 100%|██████████| 62/62 [04:10<00:00,  4.04s/it]


Average Training Loss: 0.1287
Running evaluation on the test set...


Evaluating: 100%|██████████| 24/24 [00:58<00:00,  2.45s/it]


Sigmoid Preds Stats: Min=0.0105, Max=0.2261, Mean=0.0392


Training: 100%|██████████| 62/62 [04:11<00:00,  4.05s/it]


Average Training Loss: 0.1282
Running evaluation on the test set...


Evaluating: 100%|██████████| 24/24 [00:59<00:00,  2.46s/it]

Sigmoid Preds Stats: Min=0.0100, Max=0.2232, Mean=0.0386

--- Training Complete ---
Best F1 Micro score achieved: 0.0000





In [14]:
f1_macro = eval_metrics['f1_macro']
print(f"F1 Macro score: {f1_macro:.4f}")

F1 Macro score: 0.0000
