In [None]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
import json
from tqdm.auto import tqdm

# Resolve the project root (the parent of the directory the kernel started in) only once per
# kernel session. Re-running this cell must not climb one more directory level each time, so
# the computation is skipped when ROOT_DIR already exists in the notebook namespace.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Make the project importable without stacking duplicate sys.path entries on re-run
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)
# Set the current working directory to the project root
os.chdir(ROOT_DIR)

In [None]:
from src.scripts.data_preparation import prepare_dataframes

# Folder names handed to the data-preparation helper
DATA_FOLDER, DOCS_FOLDER = 'data', 'raw-documents'

# The helper returns the train/val/test DataFrames together with the label mappings
# and the hierarchy pairs used by the later cells
(
    train_df,
    val_df,
    test_df,
    id_to_label,
    label_to_id,
    parent_child_pairs,
) = prepare_dataframes(data_folder=DATA_FOLDER, docs_folder=DOCS_FOLDER)

# Quick sanity check of the split sizes
print(f"Train df: {train_df.shape}")
print(f"Val df: {val_df.shape}")
print(f"Test df: {test_df.shape}")

In [None]:
import json
import pandas as pd
from src.data_management.preprocessor import binarize_labels
from src.data_management.processing import process_dataframe_for_training

def _parent_narrative(subnarrative):
    """Return the narrative-level prefix of a colon-delimited subnarrative label.

    The narrative is made of the first two colon-separated fields of the label.
    A label with fewer than two fields is returned unchanged instead of raising
    IndexError.
    """
    return ':'.join(subnarrative.split(':')[:2])


# Load the generated subnarrative texts
with open("generated_subnarrative_texts.json", "r", encoding="utf-8") as f:
    generated_subnarratives = json.load(f)

# One row per generated text, labelled with the subnarrative it was generated for
rows = [
    {'text': text, 'subnarratives': [item['subnarrative']]}
    for item in generated_subnarratives
    for text in item['generated_texts']
]
# Explicit columns keep the frame well-formed even when the file holds no generated texts
new_data_df = pd.DataFrame(rows, columns=['text', 'subnarratives'])

# Derive the parent narrative of every subnarrative label
new_data_df['narratives'] = new_data_df['subnarratives'].apply(
    lambda subnarratives: [_parent_narrative(subn) for subn in subnarratives]
)

# All label ids, taken from the keys of the id_to_label mapping
all_ids = list(id_to_label.keys())

# Process the new data to create binarized labels
new_data_df = process_dataframe_for_training(new_data_df, label_to_id, all_ids)
# Add article id and language columns
new_data_df = new_data_df.reset_index(drop=True)
new_data_df['id'] = new_data_df.index.map(lambda i: f"GEN_EN_{i:05d}.txt")
new_data_df['language'] = "EN"

# Concatenate the new generated data with the existing train_df
augmented_train_df = pd.concat([train_df, new_data_df], ignore_index=True)
print(f"Augmented train df: {augmented_train_df.shape}")

In [None]:
from src.training.setup import load_model_and_tokenizer
import torch

# --- Model and tokenizer setup ---
MODEL_NAME = 'xlm-roberta-base'

# Run on the GPU when torch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Number of labels = size of the id -> label mapping
num_total_labels = len(id_to_label)

# Build the model and tokenizer through the project helper
model, tokenizer = load_model_and_tokenizer(
    model_name=MODEL_NAME, device=device,
    num_total_labels=num_total_labels,
    id_to_label=id_to_label, label_to_id=label_to_id,
)

In [None]:
from src.data_management.datasets import NarrativeClassificationDataset
from torch.utils.data import DataLoader

# --- Dataset / DataLoader configuration ---
MODEL_NAME = 'xlm-roberta-base'
MAX_LENGTH = 512
BATCH_SIZE = 16

# --- Datasets: augmented training data and the untouched validation split ---
augmented_train_dataset = NarrativeClassificationDataset(augmented_train_df, tokenizer, max_length=MAX_LENGTH)
val_dataset = NarrativeClassificationDataset(val_df, tokenizer, max_length=MAX_LENGTH)

# --- DataLoaders: only the training batches are shuffled ---
train_dataloader = DataLoader(augmented_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# len() of a DataLoader is its number of batches
print(f"Train dataloader size: {len(train_dataloader)}")
print(f"Validation dataloader size: {len(val_dataloader)}")

In [None]:
# --- Compute pos_weight for BCEWithLogitsLoss ---
# pos_weight = (num_negative / num_positive) for each class
labels = np.stack(augmented_train_df['labels'].values)
num_pos = labels.sum(axis=0)
num_neg = labels.shape[0] - num_pos
# A class without any positive training example has no meaningful ratio. Guarding the
# division with a tiny epsilon would give such a class a weight of roughly N / 1e-8, and
# because the same loss function is also used for validation, a single positive of that
# class there would dominate the validation loss that drives early stopping.
# Those classes get a neutral weight of 1.0 instead.
has_pos = num_pos > 0
pos_weight_values = np.ones_like(num_pos, dtype=np.float64)
pos_weight_values[has_pos] = num_neg[has_pos] / num_pos[has_pos]
pos_weight = torch.tensor(pos_weight_values, dtype=torch.float32, device=device)
print(f"pos_weight shape: {pos_weight.shape}")

In [None]:
import torch
from tqdm.auto import tqdm
import numpy as np
import os
import torch.nn as nn
from src.training.engine import train_epoch, evaluate
from src.training.setup import setup_optimizer_and_scheduler

# --- Training Setup ---
EPOCHS = 20
LEARNING_RATE = 2e-5
MODEL_SAVE_PATH = 'models/phase1_xlmr_augmented_best_model.bin'
PATIENCE = 3  # consecutive non-improving epochs tolerated before stopping
H_LAMBDA = 1.5 # Hierarchical loss penalty

optimizer, scheduler = setup_optimizer_and_scheduler(model, train_dataloader, EPOCHS, LEARNING_RATE)

# Use pos_weight in the loss function
loss_function = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Keyword arguments passed identically to both the training and the validation pass
shared_pass_kwargs = dict(
    loss_function=loss_function,
    device=device,
    H_LAMBDA=H_LAMBDA,
    parent_child_pairs=parent_child_pairs,
)

# Early-stopping bookkeeping; the checkpoint directory must exist before the first save
best_val_loss = float('inf')
epochs_no_improve = 0
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

for epoch in tqdm(range(EPOCHS), desc="Epochs"):
    print(f"--- Epoch {epoch+1}/{EPOCHS} ---")

    # --- Training Phase ---
    train_loss = train_epoch(
        model=model,
        train_dataloader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        **shared_pass_kwargs,
    )
    print(f"Average Training Loss: {train_loss:.4f}")

    # --- Validation Phase ---
    val_loss, metrics = evaluate(
        model=model,
        eval_dataloader=val_dataloader,
        threshold=0.5,
        **shared_pass_kwargs,
    )
    print(f"Average Validation Loss: {val_loss:.4f}")
    print(f"Validation F1-score (micro): {metrics['f1_micro']:.4f}")

    # --- Early Stopping ---
    # Improvement: checkpoint the weights, reset the counter, go to the next epoch
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"Validation loss improved. Saved new best model to {MODEL_SAVE_PATH}")
        epochs_no_improve = 0
        continue

    # No improvement: count it and stop once the patience budget is used up
    epochs_no_improve += 1
    print(f"Validation loss did not improve for {epochs_no_improve} epoch(s).")
    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

print("Training complete. Best model saved.")


In [None]:
# finding the best threshold for the validation set

import torch
import numpy as np
from src.training.engine import get_raw_predictions, compute_metrics
from src.utils.metrics import find_per_level_thresholds
from src.data_management.datasets import NarrativeClassificationDataset
from torch.utils.data import DataLoader


# --- Load Best Model ---
print("\n--- Loading best model for threshold finding ---")
# map_location remaps the checkpoint's tensors onto the active device, so a checkpoint
# written on a GPU can still be loaded when this cell runs on a CPU-only kernel
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
model.to(device)

def _label_ids_at_depth(colon_count):
    """Ids in id_to_label whose label text contains exactly `colon_count` colons."""
    return [i for i, label in id_to_label.items() if label.count(":") == colon_count]


# --- Get Predictions on Validation Set ---
print("Getting raw predictions from the validation set...")
val_logits, val_true_labels = get_raw_predictions(model, val_dataloader, device)
print("Raw predictions obtained.")

# --- Find Optimal Per-Level Thresholds ---
print("\n--- Finding the optimal per-level thresholds ---")
# Logistic sigmoid turns the raw logits into per-label probabilities
val_probs = 1 / (1 + np.exp(-val_logits))

# Labels with one colon are treated as narratives, labels with two as subnarratives
narrative_indices = _label_ids_at_depth(1)
subnarrative_indices = _label_ids_at_depth(2)

thresholds_result = find_per_level_thresholds(
    probabilities=val_probs, true_labels=val_true_labels,
    narrative_indices=narrative_indices, subnarrative_indices=subnarrative_indices,
    parent_child_pairs=parent_child_pairs,
)

# Pull the two tuned thresholds out of the result mapping
narrative_threshold = thresholds_result["narrative_threshold"]
subnarrative_threshold = thresholds_result["subnarrative_threshold"]
print(f"\nOptimal narrative threshold: {narrative_threshold:.4f}")
print(f"Optimal subnarrative threshold: {subnarrative_threshold:.4f}")

# --- Final Evaluation on the UNSEEN TEST SET ---
print("\n--- Final Evaluation on TEST set using the Optimal Per-Level Thresholds ---")

test_dataset = NarrativeClassificationDataset(test_df, tokenizer, max_length=MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Raw logits for the test set, converted to probabilities with the logistic sigmoid
test_logits, test_true_labels = get_raw_predictions(model, test_dataloader, device)
test_probs = 1 / (1 + np.exp(-test_logits))

# Binarize each hierarchy level with its own threshold (strictly greater than)
binary_preds = np.zeros_like(test_probs, dtype=int)
for level_indices, level_threshold in (
    (narrative_indices, narrative_threshold),
    (subnarrative_indices, subnarrative_threshold),
):
    binary_preds[:, level_indices] = (test_probs[:, level_indices] > level_threshold).astype(int)

# Hierarchical correction: clear the first label of a pair wherever it is predicted while
# the second one is not. Skipped entirely when parent_child_pairs is falsy.
# NOTE(review): the container is named parent_child_pairs but each pair is unpacked as
# (sub_id, narr_id) — confirm the tuple order produced by prepare_dataframes.
for sub_id, narr_id in parent_child_pairs or ():
    inconsistent_mask = (binary_preds[:, sub_id] == 1) & (binary_preds[:, narr_id] == 0)
    binary_preds[inconsistent_mask, sub_id] = 0

# Final metrics on the thresholded, hierarchy-corrected predictions
from sklearn.metrics import f1_score
f1_micro, f1_macro = (
    f1_score(test_true_labels, binary_preds, average=averaging, zero_division=0)
    for averaging in ('micro', 'macro')
)

print(f"\nFinal Reportable Performance on Test Set:")
print(f"  - F1 Micro: {f1_micro:.4f}")
print(f"  - F1 Macro: {f1_macro:.4f}")

In [None]:
# --- Inference configuration ---
# Checkpoint path; same value as MODEL_SAVE_PATH in the training cell.
# NOTE(review): not referenced by the cells visible here — confirm whether it is still needed.
MODEL_PATH = 'models/phase1_xlmr_augmented_best_model.bin'
# Name handed to NarrativePredictor as its tokenizer argument
TOKENIZER_NAME = 'xlm-roberta-base'
# Folder scanned by load_articles for the .txt documents to predict on
TEST_ARTICLES_PATH = 'testset/EN/subtask-2-documents/'
# Destination passed to write_predictions_to_txt
OUTPUT_FILE = 'testset/en_predictions_augmented.txt'
# NOTE(review): not referenced by the cells visible here; the predictor cell sets its
# thresholds separately — confirm whether this constant is still needed.
OPTIMAL_THRESHOLD = 0.71

In [None]:
def load_articles(folder_path):
    """Load every ``.txt`` file directly inside ``folder_path`` into a DataFrame.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        pandas.DataFrame with the columns ``article_id`` (the file name) and
        ``text`` (the file content decoded as UTF-8), one row per file, ordered
        by file name. Both columns exist even when no ``.txt`` file is found.

    Raises:
        OSError: if the folder cannot be listed or a file cannot be read.
    """
    articles = []
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                articles.append({'article_id': filename, 'text': f.read()})
    # Explicit columns so that df['text'] still works on an empty result
    return pd.DataFrame(articles, columns=['article_id', 'text'])

In [None]:
from src.inference.narrative_predictor import NarrativePredictor

# Prepare label maps for the predictor
label_maps = {
    "id2label": id_to_label,
    "label2id": label_to_id,
    "parent_child_pairs": parent_child_pairs
}

TOKENIZER_NAME = 'xlm-roberta-base'

# Initialize the predictor with the new model
predictor = NarrativePredictor(MODEL_SAVE_PATH, TOKENIZER_NAME, label_maps)

# Use the per-level thresholds tuned on the validation set for this model instead of
# hard-coded literals, which silently go stale whenever the model is retrained.
# NOTE(review): the argument order (narrative, subnarrative) is assumed — confirm
# against NarrativePredictor.set_thresholds.
predictor.set_thresholds(float(narrative_threshold), float(subnarrative_threshold))

print(f"Loading articles from {TEST_ARTICLES_PATH}...")
df_test = load_articles(TEST_ARTICLES_PATH)
texts_to_predict = df_test['text'].tolist()

In [None]:
from src.utils.prediction_output import write_predictions_to_txt

# Use the helper function to run predictions and write to .txt file.
# It receives the configured predictor, the DataFrame built by load_articles and the
# output path defined in the inference configuration cell.
write_predictions_to_txt(predictor, df_test, OUTPUT_FILE)

# Confirmation message; printed only if the call above returned without raising
print(f"Predictions written to {OUTPUT_FILE}")