In [1]:
import os, re
from collections import defaultdict
from typing import List, Tuple, Dict, Set
from pathlib import Path # Added import

def read_article(path: str) -> str:
    """Return the contents of the file at `path` as text.

    The file is read as raw bytes and decoded as UTF-8; byte sequences that
    are not valid UTF-8 are dropped instead of raising.
    """
    return Path(path).read_bytes().decode("utf-8", "ignore")

def load_span_labels(label_file: str
    ) -> Dict[str, List[Tuple[str, int, int]]]:
    """Read a tab-separated span-label file into a per-article span list.

    Each well-formed line has exactly four tab-separated fields:
    ``article_id <TAB> label <TAB> start <TAB> end`` where ``start`` and
    ``end`` parse as integers.

    Args:
        label_file: Path to the UTF-8 encoded label file.

    Returns:
        A ``defaultdict(list)`` mapping article id -> list of
        ``(label, start, end)`` tuples in file order. Only articles that have
        at least one valid row appear as keys.

    Blank lines are ignored silently; any other line that cannot be parsed is
    skipped with a printed warning.
    """
    spans = defaultdict(list)
    with open(label_file, encoding="utf-8") as f:
        for line in f:
            stripped = line.rstrip()
            parts = stripped.split("\t")
            if len(parts) == 4:
                art_id, lab, s, e = parts
                # Parse the offsets BEFORE touching `spans[art_id]`: indexing a
                # defaultdict creates the key, so a failed conversion would
                # otherwise leave a phantom article with an empty span list.
                try:
                    start, end = int(s), int(e)
                except ValueError:
                    print(f"Warning: Skipping malformed line in {label_file}: {stripped}")
                    continue
                spans[art_id].append((lab, start, end))
            elif stripped:  # non-empty line with the wrong number of fields
                print(f"Warning: Skipping malformed line in {label_file}: {stripped}")
    return spans

# Added function to extract base classes from the span file
def get_base_classes_from_spans(label_file: str) -> Set[str]:
    """Collect the distinct label names used in a span-label file.

    A row contributes its label only if it has exactly four tab-separated
    fields and its last two fields parse as integers - the same acceptance
    rule `load_span_labels` applies. Rows that fail either check (blank lines,
    header rows, truncated lines) are ignored without a warning, because
    `load_span_labels` already reports them.

    Args:
        label_file: Path to the UTF-8 encoded label file.

    Returns:
        The set of label names found on valid rows.
    """
    base_classes = set()
    with open(label_file, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split("\t")
            if len(parts) != 4:
                continue
            _, lab, s, e = parts
            try:
                int(s)
                int(e)
            except ValueError:
                # Row would be rejected by load_span_labels; do not let it
                # introduce a class that has no spans.
                continue
            base_classes.add(lab)
    return base_classes

def build_label_maps(
    base_classes: set[str] # Changed input to accept a set of base classes
) -> tuple[list[str], dict[str, int], dict[int, str]]:
    """
    Derive the BIO tag inventory and its id mappings from the base label names.

    Input
    -----
    base_classes – unique base label names (e.g., "Appeal_to_Fear-Prejudice")

    Output
    ------
    bio_tags   – ["O", "B-<cls>", "I-<cls>", …] with classes in sorted order
    label2id   – tag string -> position of the tag in bio_tags
    id2label   – inverse of label2id
    """
    # "O" always comes first; every class then contributes its B-/I- pair.
    bio_tags = ["O"]
    for name in sorted(base_classes):
        bio_tags.append(f"B-{name}")
        bio_tags.append(f"I-{name}")

    label2id = {}
    for position, tag in enumerate(bio_tags):
        label2id[tag] = position

    id2label = {position: tag for tag, position in label2id.items()}

    return bio_tags, label2id, id2label

In [2]:
from transformers import XLMRobertaTokenizerFast

# Load the pre-trained XLM-RoBERTa tokenizer.
# `tokenizer_name` is the checkpoint identifier; the same variable is passed to
# XLMRobertaForTokenClassification.from_pretrained later in the notebook, so
# change it here to switch both tokenizer and model checkpoint.
tokenizer_name = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(tokenizer_name)

def tokenize_text(text: str):
    """
    Encode one string with the notebook-level `tokenizer`.

    Args:
        text: The input string to tokenize.

    Returns:
        The tokenizer output for `text`, requested as PyTorch tensors and
        including the character offset mapping alongside the ids and mask.
    """
    # Fixed encoding options: truncate to 512 tokens, return PyTorch tensors,
    # and keep character offsets so tokens can be aligned with the source text.
    encode_options = dict(
        truncation=True,
        max_length=512,
        return_tensors="pt",
        return_offsets_mapping=True,
    )
    return tokenizer(text, **encode_options)

# Example of how to use the function (optional).
# `tokenized_result` is reused by the label-alignment example in the next cell.
sample_text = "Here is some text to tokenize."
tokenized_result = tokenize_text(sample_text)
print(tokenized_result)

  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': tensor([[    0, 11853,    83,  3060,  7986,    47,    47,  1098, 20650,     5,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'offset_mapping': tensor([[[ 0,  0],
         [ 0,  4],
         [ 5,  7],
         [ 8, 12],
         [13, 17],
         [18, 20],
         [21, 23],
         [23, 26],
         [26, 29],
         [29, 30],
         [ 0,  0]]])}


In [3]:
from transformers.tokenization_utils_base import BatchEncoding

def align_labels_with_tokens(
    tokenized_inputs: "BatchEncoding",
    spans: List[Tuple[str, int, int]]
) -> List[str]:
    """
    Aligns character-level span labels with token-level BIO labels.

    A token receives a span's label when its character range overlaps the
    span's range by at least one character (partial overlaps count). The first
    overlapping token gets ``B-<label>``, every further overlapping token gets
    ``I-<label>``. Spans are applied in the order given, so where two spans
    cover the same token the later span's tag overwrites the earlier one.

    Args:
        tokenized_inputs: Mapping that contains 'offset_mapping' with a leading
            batch dimension; only the first batch entry is used. The entry may
            be a list of (start, end) pairs or a tensor/array exposing
            ``tolist()``.
        spans: A list of tuples, where each tuple is (label, start_char, end_char).

    Returns:
        A list of BIO labels (e.g., "O", "B-Label", "I-Label"), one per token.

    Raises:
        ValueError: If 'offset_mapping' is missing from `tokenized_inputs`.
    """
    if 'offset_mapping' not in tokenized_inputs:
        raise ValueError("Tokenizer output must include 'offset_mapping'.")

    # Only the first batch entry is used.
    offsets = tokenized_inputs['offset_mapping'][0]
    # Convert tensor/array offsets to plain Python ints once, so the nested
    # loop below compares ints instead of tensor elements on every iteration.
    if hasattr(offsets, "tolist"):
        offsets = offsets.tolist()

    labels = ["O"] * len(offsets)  # every token starts as Outside

    for label, start_char, end_char in spans:
        found_first_token = False
        for token_idx, (token_start, token_end) in enumerate(offsets):
            # Zero-width offsets (e.g. the (0, 0) given to special tokens)
            # cannot overlap anything; leave them as "O".
            if token_start == token_end:
                continue

            # Two half-open ranges overlap iff max(starts) < min(ends).
            if max(token_start, start_char) < min(token_end, end_char):
                prefix = "I" if found_first_token else "B"
                labels[token_idx] = f"{prefix}-{label}"
                found_first_token = True

    return labels

# Example usage. Relies on `tokenized_result` and `tokenizer` from the tokenizer cell,
# so that cell must have been run first.

# Sample spans given as (label, start_char, end_char); the character offsets index into
# "Here is some text to tokenize." (replace with actual spans for your data)
sample_spans = [
    ("LOC", 8, 12),   # Corresponds to "some" in "Here is some text to tokenize."
    ("VERB", 21, 29) # Corresponds to "tokenize"
]

# Use the previously tokenized result
aligned_labels = align_labels_with_tokens(tokenized_result, sample_spans)

# Print tokens and their corresponding labels
tokens = tokenizer.convert_ids_to_tokens(tokenized_result['input_ids'][0])

print("Tokens:", tokens)
print("Labels:", aligned_labels)

# Verify alignment (optional): one row per token with its label and character offsets
for token, label, offset in zip(tokens, aligned_labels, tokenized_result['offset_mapping'][0].tolist()):
    print(f"{token:<15} {label:<15} {offset}")

Tokens: ['<s>', '▁Here', '▁is', '▁some', '▁text', '▁to', '▁to', 'ken', 'ize', '.', '</s>']
Labels: ['O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-VERB', 'I-VERB', 'I-VERB', 'O', 'O']
<s>             O               [0, 0]
▁Here           O               [0, 4]
▁is             O               [5, 7]
▁some           B-LOC           [8, 12]
▁text           O               [13, 17]
▁to             O               [18, 20]
▁to             B-VERB          [21, 23]
ken             I-VERB          [23, 26]
ize             I-VERB          [26, 29]
.               O               [29, 30]
</s>            O               [0, 0]


In [4]:
def create_token_label_mapping(
    span_sources: List[Tuple[str, str]], # List of (prefix, label_file_path)
    article_dir: str,
    tokenizer: XLMRobertaTokenizerFast,
    max_length: int = 512
) -> Tuple[Dict[str, Dict[str, List[int]]], Dict[str, List[int]], Dict[str, int], Dict[int, str]]:
    """
    Creates a mapping from article ID to tokenized inputs and numerical BIO labels
    by processing multiple language sources.

    Args:
        span_sources: List of tuples, each containing (language_prefix, span_label_file_path).
                      Example: [("", "path/ru.txt"), ("en_", "path/en.txt"), ("fr_", "path/fr.txt")]
        article_dir: Path to the single directory containing all article text files.
                     The file for an article is "{prefix}article{id}.txt", where prefix is the
                     source's language prefix (possibly empty) and id is the article id from
                     the label file.
        tokenizer: An initialized Hugging Face tokenizer (e.g., XLMRobertaTokenizerFast).
        max_length: Maximum sequence length for tokenization; longer articles are truncated,
                    so spans beyond the truncation point produce no labels.

    Returns:
        A tuple containing:
        - tokenized_articles: Dict mapping unique_article_id to tokenized inputs
          (offset mapping removed).
        - article_label_ids: Dict mapping unique_article_id to list of numerical label IDs.
        - label2id: Dictionary mapping BIO label strings to integer IDs.
        - id2label: Dictionary mapping integer IDs to BIO label strings.
        Four empty dicts are returned when no spans could be loaded.

    Label files that do not exist and articles whose text file is missing are skipped
    with a printed warning.
    """
    all_spans_by_article = {}
    # Filename of each article, recorded while the source prefix is still known.
    # Re-deriving it later by splitting the unique ID on "_" is ambiguous: an
    # unprefixed ID that contains "_" or a prefix without "_" would be misparsed.
    article_filenames = {}
    all_base_classes = set()

    print("Loading spans and base classes from all sources...")
    for prefix, span_label_file in span_sources:
        print(f"  Processing source: prefix='{prefix}', file='{span_label_file}'")
        if not Path(span_label_file).exists():
            print(f"  Warning: Label file not found, skipping: {span_label_file}")
            continue
        current_spans = load_span_labels(span_label_file)
        current_base_classes = get_base_classes_from_spans(span_label_file)
        all_base_classes.update(current_base_classes)
        print(f"    Found {len(current_spans)} articles and {len(current_base_classes)} base classes.")
        # Create unique IDs like "en_12345" or just "12345" if prefix is empty
        for art_id, spans in current_spans.items():
            unique_art_id = f"{prefix}{art_id}"
            if unique_art_id in all_spans_by_article:
                print(f"    Warning: Duplicate article ID found: {unique_art_id}. Overwriting spans.")
            all_spans_by_article[unique_art_id] = spans
            article_filenames[unique_art_id] = f"{prefix}article{art_id}.txt"

    total_articles = len(all_spans_by_article)
    print(f"\nTotal unique articles found across all sources: {total_articles}")
    print(f"Total unique base classes found: {len(all_base_classes)}")

    if not all_spans_by_article:
        print("Error: No spans loaded. Cannot proceed.")
        return {}, {}, {}, {}

    print("Building label maps...")
    bio_tags, label2id, id2label = build_label_maps(all_base_classes)
    print(f"Total BIO tags: {len(bio_tags)}")

    tokenized_articles = {}
    article_label_ids = {}
    article_dir_path = Path(article_dir)

    print(f"\nProcessing articles from: {article_dir}")
    processed_count = 0  # articles visited, including skipped ones
    skipped_count = 0    # articles whose text file was missing
    for unique_art_id, spans in all_spans_by_article.items():
        processed_count += 1
        if processed_count % 100 == 0: # Print progress
            print(f"  Processing article {processed_count}/{total_articles} (ID: {unique_art_id})...")

        article_path = article_dir_path / article_filenames[unique_art_id]

        if not article_path.exists():
            print(f"  Warning: Article file not found, skipping: {article_path}")
            skipped_count += 1
            continue

        text = read_article(str(article_path))

        # Tokenize without return_tensors so the outputs stay plain lists.
        tokenized_inputs = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            return_offsets_mapping=True,
        )

        # align_labels_with_tokens reads entry [0] of 'offset_mapping', so wrap the
        # unbatched tokenizer output in a batch of size one.
        temp_batch_encoding = BatchEncoding({
            'input_ids': [tokenized_inputs['input_ids']],
            'attention_mask': [tokenized_inputs['attention_mask']],
            'offset_mapping': [tokenized_inputs['offset_mapping']]
        })
        bio_labels = align_labels_with_tokens(temp_batch_encoding, spans)

        # Convert BIO labels to numerical IDs; a tag missing from label2id falls back to "O".
        label_ids = [label2id.get(label, label2id["O"]) for label in bio_labels]

        # Offsets were only needed for alignment; drop them before storing.
        tokenized_inputs.pop("offset_mapping")
        tokenized_articles[unique_art_id] = tokenized_inputs
        article_label_ids[unique_art_id] = label_ids

    print(f"\nFinished processing. Processed: {processed_count - skipped_count}, Skipped (missing files): {skipped_count}.")
    return tokenized_articles, article_label_ids, label2id, id2label

# --- Example Usage ---
# Builds `tokenized_data`, `label_data`, `l2i` and `i2l`, which later cells read.

# Define paths (adjust if necessary)
# NOTE(review): this is an absolute, machine-specific path; it must be changed to run elsewhere.
base_data_path = Path("/home/twoface/persuasion-detection/data/processed")
articles_path = base_data_path / "ru/unwrapped-articles" # All articles are here

# Each entry is (ID/filename prefix, span label file). The prefix is prepended to the
# article IDs of that file by create_token_label_mapping.
span_sources_to_process = [
    ("", base_data_path / "ru/train-labels-subtask-3-spans.txt"), # Russian (no prefix)
    ("en_", base_data_path / "ru/train-labels-subtask-3-spans-en.txt"), # English
    ("fr_", base_data_path / "ru/train-labels-subtask-3-spans-fr.txt"),  # French
    ("ge_", base_data_path / "ru/train-labels-subtask-3-spans-ge.txt"), # German
    ("it_", base_data_path / "ru/train-labels-subtask-3-spans-it.txt"), # Italian
]

# Convert Path objects to strings for the function
span_sources_str = [(prefix, str(path)) for prefix, path in span_sources_to_process]
articles_path_str = str(articles_path)

# Ensure the tokenizer is loaded (from the second cell)
if 'tokenizer' not in locals():
    print("Tokenizer not found, please run the tokenizer loading cell first.")
else:
    # Call the function with the list of sources (max_length keeps its default of 512)
    tokenized_data, label_data, l2i, i2l = create_token_label_mapping(
        span_sources_str,
        articles_path_str,
        tokenizer
    )

    # Display results for a sample article (if data was processed)
    if tokenized_data:
        sample_id = list(tokenized_data.keys())[0]
        print(f"\n--- Sample Article ID: {sample_id} ---")
        print("Tokenized Input Keys:", tokenized_data[sample_id].keys())
        print("Number of Tokens:", len(tokenized_data[sample_id]['input_ids']))
        print("Number of Labels:", len(label_data[sample_id]))
        print("Label2ID mapping (sample):", list(l2i.items())[:5])
        print("ID2Label mapping (sample):", list(i2l.items())[:5])
    else:
        print("\nNo articles were processed. Check paths and file existence.")

Loading spans and base classes from all sources...
  Processing source: prefix='', file='/home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans.txt'
    Found 190 articles and 23 base classes.
  Processing source: prefix='en_', file='/home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans-en.txt'
    Found 504 articles and 19 base classes.
  Processing source: prefix='fr_', file='/home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans-fr.txt'
    Found 209 articles and 23 base classes.
  Processing source: prefix='ge_', file='/home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans-ge.txt'
    Found 176 articles and 23 base classes.
  Processing source: prefix='it_', file='/home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans-it.txt'
    Found 302 articles and 23 base classes.

Total unique articles found across all sources: 1381
Total unique base classes 

In [5]:
from datasets import Dataset
from typing import Dict, List
from transformers.tokenization_utils_base import BatchEncoding # Already imported but good practice

def create_hf_dataset(
    tokenized_articles: Dict[str, Dict[str, List[int]]], # Adjusted type hint
    article_label_ids: Dict[str, List[int]],
    label2id: Dict[str, int] # Keep label2id for potential future use or validation
) -> Dataset:
    """
    Assemble a Hugging Face Dataset from per-article encodings and label ids.

    Articles are visited in sorted ID order. An article is left out (with a
    printed warning) when it has no entry in `article_label_ids` or when its
    input_ids, attention_mask and labels differ in length.

    Args:
        tokenized_articles: Dict mapping article_id to tokenized inputs (Dict with 'input_ids', 'attention_mask').
        article_label_ids: Dict mapping article_id to list of numerical label IDs.
        label2id: Dictionary mapping BIO label strings to integer IDs (accepted but not used here).

    Returns:
        A Dataset with columns 'input_ids', 'attention_mask', 'labels', or
        None when no article passed the checks.
    """
    kept_ids: list = []
    kept_masks: list = []
    kept_labels: list = []
    n_skipped = 0

    print(f"Preparing dataset from {len(tokenized_articles)} articles...")

    # Sorted iteration keeps the row order independent of dict insertion order.
    for art_id in sorted(tokenized_articles):
        if art_id in article_label_ids:
            ids = tokenized_articles[art_id]['input_ids']
            mask = tokenized_articles[art_id]['attention_mask']
            tags = article_label_ids[art_id]

            # All three sequences must describe the same tokens.
            if len(ids) != len(mask) or len(mask) != len(tags):
                print(f"Warning: Length mismatch for article {art_id}. ",
                      f"input_ids: {len(ids)}, attention_mask: {len(mask)}, labels: {len(tags)}. Skipping.")
                n_skipped += 1
            else:
                kept_ids.append(ids)
                kept_masks.append(mask)
                kept_labels.append(tags)
        else:
            print(f"Warning: Labels not found for article {art_id}. Skipping.")
            n_skipped += 1

    print(f"Finished preparing dataset. Total articles processed: {len(kept_ids)}. Skipped: {n_skipped}.")

    if len(kept_ids) == 0:
        print("Error: No valid data found to create dataset.")
        return None

    return Dataset.from_dict({
        "input_ids": kept_ids,
        "attention_mask": kept_masks,
        "labels": kept_labels
    })

# --- Example Usage ---
# Builds `combined_dataset`, which the inspection and train/eval split cells read.

# Check if the required variables exist from the previous cell
if 'tokenized_data' in locals() and 'label_data' in locals() and 'l2i' in locals():
    print("\nCreating Hugging Face dataset...")
    # Ensure the data is not empty before proceeding
    if tokenized_data and label_data:
        # Rename to combined_dataset or similar to reflect augmented data
        combined_dataset = create_hf_dataset(tokenized_data, label_data, l2i)

        # create_hf_dataset returns None when no article could be used
        if combined_dataset:
            print("\nDataset created successfully!")
            print(combined_dataset)
            # You can inspect the first example:
            # print("\nFirst example:", combined_dataset[0]) # Avoid printing large examples by default
        else:
            print("\nDataset creation failed.")
    else:
        print("\nCannot create dataset: 'tokenized_data' or 'label_data' is empty.")
else:
    print("\nPlease run the previous cells to generate 'tokenized_data', 'label_data', and 'l2i'.")


Creating Hugging Face dataset...
Preparing dataset from 1381 articles...
Finished preparing dataset. Total articles processed: 1381. Skipped: 0.

Dataset created successfully!
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1381
})


In [6]:
# Inspect the created dataset: print its summary and the sequence lengths of the first row.
# Use the new variable name 'combined_dataset'
if 'combined_dataset' in locals() and combined_dataset is not None and len(combined_dataset) > 0:
    print("--- Inspecting the combined dataset ---")
    print(combined_dataset)
    print("\n--- Inspecting the first example (combined_dataset[0]) ---")
    first_example = combined_dataset[0]
    # print(first_example) # Avoid printing potentially large example
    print("Keys:", first_example.keys())
    # The three lengths below are expected to be equal (create_hf_dataset drops mismatches).
    print("Number of tokens:", len(first_example['input_ids']))
    print("Number of labels:", len(first_example['labels']))
    print("Number of attention mask values:", len(first_example['attention_mask']))

    # Slicing returns a dictionary where each key maps to a list of values for that slice
    # first_three_examples = combined_dataset[:3]
    # print("\n--- Inspecting the first 3 examples (combined_dataset[:3]) ---")
    # print("Keys:", first_three_examples.keys())
    # print("Number of examples in slice:", len(first_three_examples['input_ids'])) # Should be 3
elif 'combined_dataset' in locals() and combined_dataset is not None:
     print("The combined_dataset is empty.")
else:
    print("Variable 'combined_dataset' not found or is None. Please run the previous cell to create it.")

--- Inspecting the combined dataset ---
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1381
})

--- Inspecting the first example (combined_dataset[0]) ---
Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Number of tokens: 512
Number of labels: 512
Number of attention mask values: 512


## Model Training Setup

Now we set up the components needed for training:
1.  **Load Model**: Load `XLMRobertaForTokenClassification` with the correct number of labels.
2.  **Training Arguments**: Configure hyperparameters like learning rate, batch size, epochs, and output directories.
3.  **Metrics**: Define a function to compute evaluation metrics (precision, recall, F1) using `seqeval`.
4.  **Data Collator**: Use `DataCollatorForTokenClassification` for dynamic padding.
5.  **Trainer**: Initialize the `Trainer`.

In [7]:
# Split the combined dataset (e.g., 90% train, 10% evaluation)
# Use shuffle=True (default) and a seed for reproducibility
# NOTE(review): unlike the earlier cells, this one does not guard against
# `combined_dataset` being None or undefined.
split_dataset = combined_dataset.train_test_split(test_size=0.1, seed=42)

# Assign the splits to new variables
train_split = split_dataset['train']
eval_split = split_dataset['test'] # The 'test' key holds the evaluation split

print(f"Original combined dataset size: {len(combined_dataset)}")
print(f"Train split size: {len(train_split)}")
print(f"Evaluation split size: {len(eval_split)}")

# Display the structure of the splits
print("\nTrain split structure:")
print(train_split)
print("\nEvaluation split structure:")
print(eval_split)

Original combined dataset size: 1381
Train split size: 1242
Evaluation split size: 139

Train split structure:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1242
})

Evaluation split structure:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 139
})


## Addressing Zero F1 Score (Model Predicting Only "O") - Attempt 2: Class Weights

Increasing epochs and adjusting the learning rate did not resolve the zero F1 score, indicating the model still struggles to identify minority classes (B- and I- tags) due to the high prevalence of the 'O' tag (data imbalance).

**Next Adjustment: Class Weighting**

We will introduce class weights into the loss function. This assigns a higher penalty when the model misclassifies less frequent classes, forcing it to pay more attention to them.

Steps:
1.  **Calculate Weights:** Compute weights inversely proportional to class frequencies in the training set.
2.  **Custom Trainer:** Create a `WeightedLossTrainer` by subclassing `Trainer`.
3.  **Override Loss:** Modify the `compute_loss` method in the custom trainer to use `torch.nn.CrossEntropyLoss` with the calculated weights.
4.  **Train:** Use this custom trainer for the next training run.

In [8]:
import torch
from collections import Counter
import numpy as np

# Calculate class weights based on the training split.
# Produces `class_weights_tensor`, a float tensor with one weight per BIO tag ID.
print("Calculating class weights...")

# Flatten all labels in the training set, ignoring padding (-100)
# This list contains the numerical IDs for every token's label (O, B-Tag1, I-Tag1, B-Tag2, etc.)
# NOTE(review): the labels stored by create_token_label_mapping all come from label2id,
# so no -100 is expected at this stage; the filter looks like a safeguard only - confirm.
all_labels_flat = [label for sublist in train_split['labels'] for label in sublist if label != -100]

if not all_labels_flat:
    print("Warning: No valid labels found in train_split to calculate weights. Using uniform weights.")
    num_classes = len(l2i) # l2i maps BIO tags to IDs
    class_weights_tensor = torch.ones(num_classes)
else:
    total_tokens = len(all_labels_flat)
    num_classes = len(l2i) # Total number of unique BIO tags (O + B-Class1 + I-Class1 + ...)
    # Count the occurrences of each specific BIO tag ID
    label_counts = Counter(all_labels_flat)

    print(f"Total non-padding tokens in training set: {total_tokens}")
    print(f"Number of classes (unique BIO tags): {num_classes}")
    # print(f"Label counts (sample): {dict(list(label_counts.items())[:10])}") # Optional: view counts

    # Calculate weights inversely proportional to the frequency of each specific BIO tag.
    # Formula: total_tokens / (num_classes * count_of_specific_tag)
    # This inherently handles:
    #   1. O vs B/I imbalance: 'O' (ID 0) is usually most frequent, getting the lowest weight.
    #   2. Base class imbalance: A frequent class like 'Loaded_Language' will result in
    #      higher counts for its B/I tags compared to a rare class, thus B/I tags
    #      for 'Loaded_Language' will get lower weights than B/I tags for the rare class.
    # Using label_counts.get(i, 1) handles classes potentially not present in the training split (assigns high weight).
    # The weight at list position i belongs to the tag with ID i.
    weights = [total_tokens / (num_classes * label_counts.get(i, 1)) for i in range(num_classes)]

    # Convert to tensor
    class_weights_tensor = torch.tensor(weights, dtype=torch.float)

    print(f"Calculated class weights tensor (first 5): {class_weights_tensor[:5]}")
    print(f"Weight for 'O' (ID {l2i.get('O', -1)}): {class_weights_tensor[l2i.get('O', 0)]:.4f}") # Use l2i to find O's ID
    # Example: Print weight for the first B-tag if available
    first_b_tag = next((tag for tag in l2i if tag.startswith('B-')), None)
    if first_b_tag:
        first_b_id = l2i[first_b_tag]
        print(f"Weight for '{first_b_tag}' (ID {first_b_id}): {class_weights_tensor[first_b_id]:.4f}")
    # Example: Print weight for the first I-tag if available
    first_i_tag = next((tag for tag in l2i if tag.startswith('I-')), None)
    if first_i_tag:
        first_i_id = l2i[first_i_tag]
        print(f"Weight for '{first_i_tag}' (ID {first_i_id}): {class_weights_tensor[first_i_id]:.4f}")

# Ensure the tensor is available
# (both branches above assign it, so this only fires if that logic is changed)
if 'class_weights_tensor' not in locals():
   raise RuntimeError("class_weights_tensor was not calculated.")

Calculating class weights...
Total non-padding tokens in training set: 597276
Number of classes (unique BIO tags): 47
Calculated class weights tensor (first 5): tensor([2.9324e-02, 3.8509e+01, 1.8067e+00, 2.0464e+01, 1.1635e+00])
Weight for 'O' (ID 0): 0.0293
Weight for 'B-Appeal_to_Authority' (ID 1): 38.5091
Weight for 'I-Appeal_to_Authority' (ID 2): 1.8067
Total non-padding tokens in training set: 597276
Number of classes (unique BIO tags): 47
Calculated class weights tensor (first 5): tensor([2.9324e-02, 3.8509e+01, 1.8067e+00, 2.0464e+01, 1.1635e+00])
Weight for 'O' (ID 0): 0.0293
Weight for 'B-Appeal_to_Authority' (ID 1): 38.5091
Weight for 'I-Appeal_to_Authority' (ID 2): 1.8067


In [9]:
from transformers import XLMRobertaForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate # Use evaluate instead of load_metric
import torch # Ensure torch is imported

# Define the Custom Trainer with Weighted Loss
class WeightedLossTrainer(Trainer):
    """Trainer that computes the token-classification loss with per-class weights.

    Args:
        class_weights: Optional 1-D float tensor with one weight per label ID.
            When None (the default) an unweighted cross-entropy is used.
        *args, **kwargs: Forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    # Accepts **kwargs so extra keyword arguments passed by the caller are tolerated.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the (optionally weighted) cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch dict. Its "labels" entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: If True, return ``(loss, outputs)`` instead of
                only the loss.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Take device/dtype from the logits rather than `model.device`: a wrapped
        # model may not expose `.device`, and the weights must match the logits.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        # CrossEntropyLoss ignores targets equal to -100 by default (padding labels).
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        # Flatten (batch, seq, num_labels) -> (batch*seq, num_labels); reshape also
        # handles non-contiguous tensors, which `view` would reject.
        loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return (loss, outputs) if return_outputs else loss

print("WeightedLossTrainer class defined.")

print("Setting up training components with weighted loss...")
# 1. Model: load the checkpoint named by `tokenizer_name` with the label maps
#    (`i2l`, `l2i`) produced by create_token_label_mapping.
model = XLMRobertaForTokenClassification.from_pretrained(
    tokenizer_name,
    id2label=i2l,
    label2id=l2i
)
print(f"Model re-loaded: {tokenizer_name} with {model.config.num_labels} labels.")
# Or reuse the existing model variable if desired
# print(f"Using existing model: {tokenizer_name} with {model.config.num_labels} labels.") # Commented out reuse message

# 2. Training Arguments
training_args = TrainingArguments(
    output_dir="./results_combined_weighted_v2", # Changed output dir again
    eval_strategy="epoch",
    learning_rate=2e-5, # Increased learning rate slightly
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10, # Increased epochs
    weight_decay=0.01,
    logging_dir='./logs_combined_weighted_v2', # Changed logging dir again
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1", # matches the "f1" key returned by compute_metrics
    report_to="none"
)
print("TrainingArguments defined for weighted loss run (v2).")

# 3. Metrics Calculation
metric = evaluate.load("seqeval")
# label_list[i] is the tag string for label ID i: i2l was built in ascending ID order
# by build_label_maps, so its values are already ordered by ID.
label_list = list(i2l.values()) # Use i2l from the previous cell

def compute_metrics(p):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class scores
            with the class axis at position 2; ``labels`` holds gold class ids,
            where -100 marks positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy". All four are
        0.0 when no example keeps any position after the -100 filter.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Walk every example once, translating ids to tag strings and skipping the
    # ignored positions. Examples that end up empty are left out entirely.
    final_true_predictions = []
    final_true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        pred_tags = []
        gold_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                continue
            pred_tags.append(label_list[pred_id])
            gold_tags.append(label_list[gold_id])
        if gold_tags and pred_tags:
            final_true_predictions.append(pred_tags)
            final_true_labels.append(gold_tags)

    # Nothing left to score (e.g. every position was ignored).
    if not final_true_labels:
        print("Warning: No valid examples found after filtering -100. Returning zero metrics.")
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}

    # Compute all metrics using seqeval
    results = metric.compute(predictions=final_true_predictions, references=final_true_labels)

    # A zero overall F1 gets an explanatory hint in the log.
    overall_f1_is_zero = results.get("overall_f1", 0.0) == 0.0
    if overall_f1_is_zero:
        print("\nNote: Overall F1 score is 0.0. The model might not be predicting any entities correctly yet.")
        print("Consider training for more epochs, adjusting hyperparameters, or checking label alignment.")

    # Re-key the overall scores to the short names the Trainer is configured with.
    short_to_overall = {
        "precision": "overall_precision",
        "recall": "overall_recall",
        "f1": "overall_f1",
        "accuracy": "overall_accuracy",
    }
    return {short: results[overall] for short, overall in short_to_overall.items()}
# Progress message for the notebook log.
print("compute_metrics function defined.")

# 4. Data Collator (remains the same)
# Collator for token-classification batches, built from the notebook's tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)
print("DataCollatorForTokenClassification initialized.")

# 5. Trainer - Use the custom WeightedLossTrainer
# Ensure class_weights_tensor is defined from the previous cell
# (at cell/module level locals() is the notebook's global namespace, so this
# fails fast with a readable message instead of a bare NameError further down).
if 'class_weights_tensor' not in locals():
   raise NameError("Variable 'class_weights_tensor' not defined. Please run the previous cell.")

# NOTE(review): the warning captured in this cell's output points at
# super().__init__ — presumably a deprecation of one of these keyword arguments
# (possibly `tokenizer=`); confirm against the installed transformers version.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split, # Use train_split
    eval_dataset=eval_split,   # Use eval_split
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor # Pass the calculated weights
)
print("WeightedLossTrainer initialized.")

WeightedLossTrainer class defined.
Setting up training components with weighted loss...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model re-loaded: xlm-roberta-large with 47 labels.
TrainingArguments defined for weighted loss run (v2).
TrainingArguments defined for weighted loss run (v2).
compute_metrics function defined.
DataCollatorForTokenClassification initialized.
compute_metrics function defined.
DataCollatorForTokenClassification initialized.


  super().__init__(*args, **kwargs)


WeightedLossTrainer initialized.


## Start Training

Execute the training process.

In [10]:
# Start the training with weighted loss
# Runs only when the setup cell produced a WeightedLossTrainer named `trainer`;
# otherwise a hint is printed and nothing is trained.
if 'trainer' in locals() and isinstance(trainer, WeightedLossTrainer):
    print("\nStarting training on the combined dataset with weighted loss (v2)...")
    train_result = trainer.train()
    print("\nTraining finished.")

    # Save the final model, tokenizer, and training arguments
    final_model_path = training_args.output_dir # Use path from args
    trainer.save_model() # Saves to output_dir defined in training_args
    print(f"Model saved to {final_model_path}")

    # Log metrics
    # Training metrics are both logged and written out, followed by the trainer state.
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print("Training metrics and state saved.")
    
    # Evaluate the best model on the evaluation set
    # `eval_metrics` is kept at notebook level: the export cell further down reads it.
    print("\nEvaluating the best model on the evaluation set...")
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)
    print("Evaluation metrics saved.")
    print(eval_metrics) # Print final eval metrics
else:
    print("\nWeightedLossTrainer was not initialized correctly. Cannot start training. Please check the setup cell.")


Starting training on the combined dataset with weighted loss (v2)...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Detailed Error Analysis

Let's examine the performance on the evaluation set in more detail, looking at the precision, recall, and F1-score for each entity type.

In [None]:
from seqeval.metrics import classification_report
import numpy as np

print("Performing detailed evaluation and error analysis...")

# Ensure trainer and eval_split exist
if 'trainer' in locals() and 'eval_split' in locals():
    # Get predictions: arg-max over the class axis gives one predicted id per position.
    predictions_output = trainer.predict(eval_split)
    predictions = np.argmax(predictions_output.predictions, axis=2)
    true_labels = predictions_output.label_ids

    # Convert indices to labels, removing -100 (positions excluded from scoring).
    true_predictions_list = [
        [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, true_labels)
    ]
    true_labels_list = [
        [label_list[gold_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, true_labels)
    ]

    # Filter out empty examples if any occurred after removing -100
    valid_indices = [i for i, (lbls, preds) in enumerate(zip(true_labels_list, true_predictions_list)) if lbls and preds]
    final_true_predictions = [true_predictions_list[i] for i in valid_indices]
    final_true_labels = [true_labels_list[i] for i in valid_indices]

    if final_true_labels and final_true_predictions:
        # Generate and print the classification report.
        # `report` stays at notebook level: the export cell below reads it.
        report = classification_report(final_true_labels, final_true_predictions, digits=4)
        print("\nClassification Report:")
        print(report)

        # Save the report to a file
        report_path = Path(training_args.output_dir) / "classification_report.txt"
        try:
            # The output directory is not guaranteed to exist yet (e.g. when training
            # was interrupted before anything was saved), so create it first.
            report_path.parent.mkdir(parents=True, exist_ok=True)
            # Explicit encoding keeps the file identical across platforms.
            with open(report_path, "w", encoding="utf-8") as f:
                f.write(report)
            print(f"\nClassification report saved to: {report_path}")
        except Exception as e:
            # Best effort: the report was already printed above, so only warn.
            print(f"\nError saving classification report: {e}")

    else:
         print("\nCould not generate classification report: No valid labels/predictions found after filtering.")
else:
    print("\nCould not perform error analysis: 'trainer' or 'eval_split' not found.")

Performing detailed evaluation and error analysis...

Classification Report:
                                  precision    recall  f1-score   support

             Appeal_to_Authority     0.0008    0.0952    0.0016        21
        Appeal_to_Fear-Prejudice     0.0014    0.0377    0.0026        53
             Appeal_to_Hypocrisy     0.0000    0.0000    0.0000        29
            Appeal_to_Popularity     0.0000    0.0000    0.0000        10
                  Appeal_to_Time     0.0030    0.1667    0.0060         6
                Appeal_to_Values     0.0014    0.0400    0.0027        25
       Causal_Oversimplification     0.0000    0.0000    0.0000        20
Consequential_Oversimplification     0.0000    0.0000    0.0000        16
             Conversation_Killer     0.0018    0.0588    0.0034        34
                           Doubt     0.0000    0.0000    0.0000       239
       Exaggeration-Minimisation     0.0020    0.0351    0.0038        57
         False_Dilemma-No_Choice  

In [None]:
import json
import datetime
from pathlib import Path
from transformers import TrainingArguments # Assuming TrainingArguments is needed
# Assuming seqeval report format is a string, adjust if it's a dict
from typing import Dict, Any, Union 

def export_experiment_results(
    training_args: "TrainingArguments",
    tokenizer_name: str,
    eval_metrics: Dict[str, float],
    report: Union[str, Dict[str, Any]], # Allow string or dict for report
    output_dir_override: Union[str, None] = None
):
    """
    Exports experiment results (config, metrics, report) to a timestamped JSON file.

    The payload is serialized completely before the file is opened, so a
    serialization problem can no longer leave a truncated JSON file behind.
    Values the json module cannot encode on its own are converted through
    their ``.item()`` / ``.tolist()`` method when they have one, sets become
    sorted lists, and anything else is stored as ``str(value)``.

    Args:
        training_args: The TrainingArguments object used for the run. Only
            ``to_dict()`` and ``output_dir`` are read.
        tokenizer_name: The name of the tokenizer/model used.
        eval_metrics: A dictionary containing evaluation metrics (e.g., from trainer.evaluate()).
        report: The classification report (string or dictionary).
        output_dir_override: Optional path to specify a different output directory.
                             If None (or empty), uses training_args.output_dir.

    Returns:
        None. Serialization and write errors are reported with a printed
        message rather than raised; a failure to create the output directory
        still raises.
    """
    print("Exporting experiment results...")

    def _jsonable(value):
        """Fallback used by json.dumps for values it cannot encode natively."""
        for converter in ("item", "tolist"):
            method = getattr(value, converter, None)
            if callable(method):
                try:
                    return method()
                except (TypeError, ValueError):
                    # e.g. .item() on a multi-element array; try the next converter.
                    continue
        if isinstance(value, (set, frozenset)):
            # Sort for a deterministic file; key=str tolerates mixed element types.
            return sorted(value, key=str)
        return str(value)

    # Create a timestamp (second resolution; it is also part of the file name).
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Prepare data for export
    export_data = {
        "timestamp": timestamp,
        "model_name": tokenizer_name,
        "training_arguments": training_args.to_dict(), # Convert TrainingArguments to dict
        "evaluation_metrics": eval_metrics,
        "classification_report": report
    }

    # Define output path
    if output_dir_override:
        output_dir = Path(output_dir_override)
    else:
        output_dir = Path(training_args.output_dir)

    output_dir.mkdir(parents=True, exist_ok=True) # Ensure directory exists
    export_filename = output_dir / f"experiment_summary_{timestamp}.json"

    # Save data to JSON file
    try:
        # Serialize first, write second: nothing touches the disk if encoding fails.
        payload = json.dumps(export_data, ensure_ascii=False, indent=4, default=_jsonable)
        with open(export_filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"Experiment summary saved successfully to: {export_filename}")
    except Exception as e:
        print(f"Error saving experiment summary: {e}")

# Write the run summary. `eval_metrics` comes from the training cell and `report`
# from the error-analysis cell, so both of those cells must have completed first.
export_experiment_results(
    training_args,
    tokenizer_name,
    eval_metrics,
    report,
    output_dir_override='../experiment_results' # Example override path (adjust as needed)
)

Exporting experiment results...
Experiment summary saved successfully to: ../experiment_results/experiment_summary_20250501_210146.json


## Clear Model from Memory

Delete the model and trainer objects and clear the GPU cache to free up memory.

In [11]:
import torch
import gc

print("Clearing model and trainer from memory...")

# Drop the notebook-level references when they exist; a missing name is simply
# skipped without a message.
try:
    del model
except NameError:
    pass
else:
    print("  Deleted 'model' variable.")

try:
    del trainer
except NameError:
    pass
else:
    print("  Deleted 'trainer' variable.")
# Other large model-related objects (e.g. optimizer, scheduler) could be
# released here in the same way if needed.

# Ask Python to collect whatever just became unreachable.
gc.collect()
print("  Ran garbage collector.")

# Release PyTorch's cached CUDA memory, when a GPU is present.
if not torch.cuda.is_available():
    print("  CUDA not available, skipping cache clearing.")
else:
    torch.cuda.empty_cache()
    print("  Cleared PyTorch CUDA cache.")

print("Memory clearing process finished.")

Clearing model and trainer from memory...
  Deleted 'model' variable.
  Deleted 'trainer' variable.
  Ran garbage collector.
  Cleared PyTorch CUDA cache.
Memory clearing process finished.
  Cleared PyTorch CUDA cache.
Memory clearing process finished.
