In [1]:
import os, re
from collections import defaultdict
from typing import List, Tuple, Dict, Set
from pathlib import Path # Added import

def read_article(path: str) -> str:
    with open(path, "rb") as f:
        raw = f.read()
    return raw.decode("utf-8", "ignore")

def load_span_labels(label_file: str
    ) -> Dict[str, List[Tuple[str, int, int]]]:
    
    spans = defaultdict(list)
    with open(label_file, encoding="utf-8") as f:
        for line in f:
            # Handle potential empty lines or lines with incorrect format
            parts = line.rstrip().split("\t")
            if len(parts) == 4:
                art_id, lab, s, e = parts
                try:
                    spans[art_id].append((lab, int(s), int(e)))
                except ValueError:
                    print(f"Warning: Skipping malformed line in {label_file}: {line.rstrip()}")
            elif line.strip(): # Print warning for non-empty, but malformed lines
                 print(f"Warning: Skipping malformed line in {label_file}: {line.rstrip()}")
    return spans

# Added function to extract base classes from the span file
def get_base_classes_from_spans(label_file: str) -> Set[str]:
    base_classes = set()
    with open(label_file, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split("\t")
            if len(parts) == 4:
                _, lab, _, _ = parts
                base_classes.add(lab)
            elif line.strip():
                 # Warnings handled in load_span_labels, no need to repeat here
                 pass
    return base_classes

def build_label_maps(
    base_classes: set[str] # Changed input to accept a set of base classes
) -> tuple[list[str], dict[str, int], dict[int, str]]:
    """
    Input
    -----
    base_classes – A set containing all unique base label names (e.g., "Appeal_to_Fear-Prejudice")

    Output
    ------
    bio_tags   – full list like ["O", "B-Appeal_to_Fear-Prejudice", "I-Appeal_to_Fear-Prejudice", …]
    label2id   – {"O": 0, "B-…": 1, …}   – used to turn tags into numbers
    id2label   – inverse of label2id      – needed by the model/Trainer
    """
    # 1️⃣ Use the provided base classes
    sorted_base_classes = sorted(list(base_classes))

    # 2️⃣ build BIO strings
    bio_tags = ["O"]                      # outside any span
    for cls in sorted_base_classes:
        bio_tags.extend([f"B-{cls}", f"I-{cls}"])   # beginning / inside

    # 3️⃣ numeric maps
    label2id = {tag: i for i, tag in enumerate(bio_tags)}
    id2label = {i: tag for tag, i in label2id.items()}

    return bio_tags, label2id, id2label

In [2]:
from transformers import XLMRobertaTokenizerFast

# Load the pre-trained XLM-RoBERTa tokenizer
# You can replace "xlm-roberta-base" with a specific model if needed
tokenizer_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(tokenizer_name)

def tokenize_text(text: str):
    """
    Tokenizes the input text using the pre-loaded XLMRobertaTokenizerFast.

    Args:
        text: The input string to tokenize.

    Returns:
        A dictionary containing the tokenized 'input_ids', 'attention_mask', etc.
    """
    # Tokenize the text, adding common options like truncation and max_length
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,  # You can adjust the max_length if needed
        return_tensors="pt", # Return PyTorch tensors, change to "tf" for TensorFlow if required
        return_offsets_mapping=True, # Useful for aligning tokens with original text
    )
    return tokenized_inputs

# Example of how to use the function (optional)
sample_text = "Here is some text to tokenize."
tokenized_result = tokenize_text(sample_text)
print(tokenized_result)

  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': tensor([[    0, 11853,    83,  3060,  7986,    47,    47,  1098, 20650,     5,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'offset_mapping': tensor([[[ 0,  0],
         [ 0,  4],
         [ 5,  7],
         [ 8, 12],
         [13, 17],
         [18, 20],
         [21, 23],
         [23, 26],
         [26, 29],
         [29, 30],
         [ 0,  0]]])}


In [3]:
from transformers.tokenization_utils_base import BatchEncoding

def align_labels_with_tokens(
    tokenized_inputs: BatchEncoding,
    spans: List[Tuple[str, int, int]]
) -> List[str]:
    """
    Aligns character-level span labels with token-level BIO labels.
    Handles cases where token spans partially overlap with label spans.

    Args:
        tokenized_inputs: Output from the tokenizer (must include 'offset_mapping').
        spans: A list of tuples, where each tuple is (label, start_char, end_char).

    Returns:
        A list of BIO labels (e.g., "O", "B-Label", "I-Label") corresponding to each token.
    """
    # Ensure offset mapping is present
    if 'offset_mapping' not in tokenized_inputs:
        raise ValueError("Tokenizer output must include 'offset_mapping'.")

    # offset_mapping is typically shape (batch_size, sequence_length, 2)
    # Assuming batch_size is 1 for this function
    # Removed .tolist() as the input might already be a list
    offsets = tokenized_inputs['offset_mapping'][0]
    num_tokens = len(offsets)
    labels = ["O"] * num_tokens # Initialize all labels as Outside

    # Sort spans by start index to handle potential overlaps consistently (optional but good practice)
    # spans.sort(key=lambda x: x[1]) # Uncomment if sorting is desired

    for label, start_char, end_char in spans:
        found_first_token = False
        for token_idx, (token_start, token_end) in enumerate(offsets):
            # Skip special tokens (like [CLS], [SEP]) which have (0, 0) offset
            if token_start == token_end:
                continue

            # Check for overlap between token span and label span
            # This condition is true if there is *any* overlap, including partial overlaps.
            # max(start1, start2) < min(end1, end2)
            if max(token_start, start_char) < min(token_end, end_char):
                # Assign B- tag to the first token overlapping the span
                if not found_first_token:
                    labels[token_idx] = f"B-{label}" # Begin label
                    found_first_token = True
                # Assign I- tag to subsequent tokens overlapping the *same* span
                else:
                    labels[token_idx] = f"I-{label}" # Inside label
            # Optimization: If the token starts after the span ends,
            # we don't need to check further for this span.
            # elif token_start >= end_char:
            #     break # Uncomment if spans are sorted by start_char

    return labels

# Example Usage (using variables from previous cells if run in the same notebook)

# Sample spans (replace with actual spans for your data)
sample_spans = [
    ("LOC", 8, 12),   # Corresponds to "some" in "Here is some text to tokenize."
    ("VERB", 21, 29) # Corresponds to "tokenize"
]

# Use the previously tokenized result
aligned_labels = align_labels_with_tokens(tokenized_result, sample_spans)

# Print tokens and their corresponding labels
tokens = tokenizer.convert_ids_to_tokens(tokenized_result['input_ids'][0])

print("Tokens:", tokens)
print("Labels:", aligned_labels)

# Verify alignment (optional)
for token, label, offset in zip(tokens, aligned_labels, tokenized_result['offset_mapping'][0].tolist()):
    print(f"{token:<15} {label:<15} {offset}")

Tokens: ['<s>', '▁Here', '▁is', '▁some', '▁text', '▁to', '▁to', 'ken', 'ize', '.', '</s>']
Labels: ['O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-VERB', 'I-VERB', 'I-VERB', 'O', 'O']
<s>             O               [0, 0]
▁Here           O               [0, 4]
▁is             O               [5, 7]
▁some           B-LOC           [8, 12]
▁text           O               [13, 17]
▁to             O               [18, 20]
▁to             B-VERB          [21, 23]
ken             I-VERB          [23, 26]
ize             I-VERB          [26, 29]
.               O               [29, 30]
</s>            O               [0, 0]


In [4]:
def create_token_label_mapping(
    span_label_file: str,
    article_dir: str,
    tokenizer: XLMRobertaTokenizerFast,
    max_length: int = 512
) -> Tuple[Dict[str, BatchEncoding], Dict[str, List[int]], Dict[str, int], Dict[int, str]]:
    """
    Creates a mapping from article ID to tokenized inputs and numerical BIO labels.

    Args:
        span_label_file: Path to the file containing span labels (tsv format).
        article_dir: Path to the directory containing the article text files.
        tokenizer: An initialized Hugging Face tokenizer (e.g., XLMRobertaTokenizerFast).
        max_length: Maximum sequence length for tokenization.

    Returns:
        A tuple containing:
        - tokenized_articles: Dict mapping article_id to tokenized BatchEncoding.
        - article_label_ids: Dict mapping article_id to list of numerical label IDs.
        - label2id: Dictionary mapping BIO label strings to integer IDs.
        - id2label: Dictionary mapping integer IDs to BIO label strings.
    """
    print(f"Loading spans from: {span_label_file}")
    spans_by_article = load_span_labels(span_label_file)
    total_articles = len(spans_by_article)
    print(f"Found spans for {total_articles} articles.")

    print("Extracting base classes...")
    base_classes = get_base_classes_from_spans(span_label_file)
    print(f"Found {len(base_classes)} unique base classes.")

    print("Building label maps...")
    bio_tags, label2id, id2label = build_label_maps(base_classes)
    print(f"Total BIO tags: {len(bio_tags)}")

    tokenized_articles = {}
    article_label_ids = {}
    article_dir_path = Path(article_dir)

    print(f"Processing articles from: {article_dir}")
    processed_count = 0
    for art_id, spans in spans_by_article.items():
        processed_count += 1
        if processed_count % 100 == 0: # Print progress every 100 articles
             print(f"Processing article {processed_count}/{total_articles} (ID: {art_id})...")

        article_path = article_dir_path / f"article{art_id}.txt"
        if not article_path.exists():
            print(f"Warning: Article file not found, skipping: {article_path}")
            continue

        text = read_article(str(article_path))

        # Tokenize (using the function from the previous cell, adapting args)
        tokenized_inputs = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            return_offsets_mapping=True,
            # No return_tensors="pt" here, keep as lists for now
        )

        # Align character spans to token BIO labels
        # Need to wrap tokenized_inputs for align_labels_with_tokens
        # It expects BatchEncoding, let's simulate batch size 1
        temp_batch_encoding = BatchEncoding({
            'input_ids': [tokenized_inputs['input_ids']],
            'attention_mask': [tokenized_inputs['attention_mask']],
            'offset_mapping': [tokenized_inputs['offset_mapping']]
        })
        bio_labels = align_labels_with_tokens(temp_batch_encoding, spans)

        # Convert BIO labels to numerical IDs
        label_ids = [label2id.get(label, label2id["O"]) for label in bio_labels]

        # Store results (remove offset mapping if not needed later)
        tokenized_inputs.pop("offset_mapping")
        tokenized_articles[art_id] = tokenized_inputs
        article_label_ids[art_id] = label_ids

    print(f"Finished processing {processed_count} articles.")
    return tokenized_articles, article_label_ids, label2id, id2label

# --- Example Usage ---

# Define paths (adjust if necessary)
span_file = "/home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans.txt"
# Assuming articles are in a sibling 'articles/ru' directory relative to 'processed/ru'
articles_path = "/home/twoface/persuasion-detection/data/processed/ru/unwrapped-articles" 

# Ensure the tokenizer is loaded (from the second cell)
if 'tokenizer' not in locals():
    print("Tokenizer not found, please run the tokenizer loading cell first.")
else:
    # Call the function
    tokenized_data, label_data, l2i, i2l = create_token_label_mapping(
        span_file,
        articles_path,
        tokenizer
    )

    # Display results for a sample article (if data was processed)
    if tokenized_data:
        sample_id = list(tokenized_data.keys())[0]
        print(f"\n--- Sample Article ID: {sample_id} ---")
        print("Tokenized Input Keys:", tokenized_data[sample_id].keys())
        print("Number of Tokens:", len(tokenized_data[sample_id]['input_ids']))
        print("Number of Labels:", len(label_data[sample_id]))
        print("Label2ID mapping (sample):", list(l2i.items())[:5])
    else:
        print("\nNo articles were processed. Check paths and file existence.")

Loading spans from: /home/twoface/persuasion-detection/data/processed/ru/train-labels-subtask-3-spans.txt
Found spans for 190 articles.
Extracting base classes...
Found 23 unique base classes.
Building label maps...
Total BIO tags: 47
Processing articles from: /home/twoface/persuasion-detection/data/processed/ru/unwrapped-articles
Processing article 100/190 (ID: 2465)...
Finished processing 190 articles.

--- Sample Article ID: 24151 ---
Tokenized Input Keys: dict_keys(['input_ids', 'attention_mask'])
Number of Tokens: 459
Number of Labels: 459
Label2ID mapping (sample): [('O', 0), ('B-Appeal_to_Authority', 1), ('I-Appeal_to_Authority', 2), ('B-Appeal_to_Fear-Prejudice', 3), ('I-Appeal_to_Fear-Prejudice', 4)]


In [5]:
from datasets import Dataset
from typing import Dict, List
from transformers.tokenization_utils_base import BatchEncoding # Already imported but good practice

def create_hf_dataset(
    tokenized_articles: Dict[str, Dict[str, List[int]]], # Adjusted type hint
    article_label_ids: Dict[str, List[int]],
    label2id: Dict[str, int] # Keep label2id for potential future use or validation
) -> Dataset:
    """
    Creates a Hugging Face Dataset object from tokenized articles and labels.

    Args:
        tokenized_articles: Dict mapping article_id to tokenized inputs (Dict with 'input_ids', 'attention_mask').
        article_label_ids: Dict mapping article_id to list of numerical label IDs.
        label2id: Dictionary mapping BIO label strings to integer IDs (optional, could be used for validation).

    Returns:
        A Hugging Face Dataset object with columns 'input_ids', 'attention_mask', 'labels'.
    """
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    print(f"Preparing dataset from {len(tokenized_articles)} articles...")
    skipped_count = 0
    for art_id in tokenized_articles.keys():
        if art_id not in article_label_ids:
            print(f"Warning: Labels not found for article {art_id}. Skipping.")
            skipped_count += 1
            continue

        input_ids = tokenized_articles[art_id]['input_ids']
        attention_mask = tokenized_articles[art_id]['attention_mask']
        labels = article_label_ids[art_id]

        # Sanity check: lengths must match
        if not (len(input_ids) == len(attention_mask) == len(labels)):
            print(f"Warning: Length mismatch for article {art_id}. ",
                  f"input_ids: {len(input_ids)}, attention_mask: {len(attention_mask)}, labels: {len(labels)}. Skipping.")
            skipped_count += 1
            continue

        all_input_ids.append(input_ids)
        all_attention_masks.append(attention_mask)
        all_labels.append(labels)

    print(f"Finished preparing dataset. Total articles processed: {len(all_input_ids)}. Skipped: {skipped_count}.")

    if not all_input_ids: # Handle case where no data was processed
        print("Error: No valid data found to create dataset.")
        return None

    dataset_dict = {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels
    }

    # Create the Dataset object
    hf_dataset = Dataset.from_dict(dataset_dict)
    return hf_dataset

# --- Example Usage ---

# Check if the required variables exist from the previous cell
if 'tokenized_data' in locals() and 'label_data' in locals() and 'l2i' in locals():
    print("\nCreating Hugging Face dataset...")
    # Ensure the data is not empty before proceeding
    if tokenized_data and label_data:
        train_dataset = create_hf_dataset(tokenized_data, label_data, l2i)

        if train_dataset:
            print("\nDataset created successfully!")
            print(train_dataset)
            # You can inspect the first example:
            print("\nFirst example:", train_dataset[0])
        else:
            print("\nDataset creation failed.")
    else:
        print("\nCannot create dataset: 'tokenized_data' or 'label_data' is empty.")
else:
    print("\nPlease run the previous cells to generate 'tokenized_data', 'label_data', and 'l2i'.")


Creating Hugging Face dataset...
Preparing dataset from 190 articles...
Finished preparing dataset. Total articles processed: 190. Skipped: 0.

Dataset created successfully!
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 190
})

First example: {'input_ids': [0, 31420, 2582, 105, 29, 41883, 546, 407, 1739, 142317, 718, 190012, 1258, 49, 119065, 20, 75533, 29, 41883, 547, 407, 82847, 11519, 27546, 31420, 83420, 718, 190012, 1258, 75533, 29, 41883, 547, 407, 82847, 11519, 27546, 31420, 83420, 718, 190012, 1258, 77900, 65346, 1082, 29440, 14560, 1086, 183, 38088, 175436, 407, 23334, 97430, 44140, 16176, 190012, 292, 1097, 39061, 969, 54662, 53173, 49, 155980, 969, 54662, 103, 5999, 5, 5187, 79015, 111918, 75533, 4, 135, 24010, 62415, 130, 237748, 13022, 14560, 78568, 31420, 2582, 5, 4531, 214461, 4, 414, 4060, 22051, 5509, 13299, 86146, 197, 14560, 126750, 25514, 8808, 43123, 87781, 22480, 32187, 5, 94, 170196, 13299, 77, 23494, 3505, 86665, 70209, 146666

In [6]:
# Inspect the created dataset

if 'train_dataset' in locals() and train_dataset is not None and len(train_dataset) > 0:
    print("--- Inspecting the first example (train_dataset[0]) ---")
    first_example = train_dataset[0]
    print(first_example)
    print("\nNumber of tokens:", len(first_example['input_ids']))
    print("Number of labels:", len(first_example['labels']))
    print("Number of attention mask values:", len(first_example['attention_mask']))

    print("\n--- Inspecting the first 3 examples (train_dataset[:3]) ---")
    # Slicing returns a dictionary where each key maps to a list of values for that slice
    first_three_examples = train_dataset[:3]
    # Print lengths to show structure
    print("Keys:", first_three_examples.keys())
    print("Number of examples in slice:", len(first_three_examples['input_ids'])) # Should be 3
    # Optionally print the details of the first example within the slice
    # print("\nDetails of first example in slice:")
    # print({"input_ids": first_three_examples['input_ids'][0], 
    #        "attention_mask": first_three_examples['attention_mask'][0], 
    #        "labels": first_three_examples['labels'][0]})
elif 'train_dataset' in locals() and train_dataset is not None:
     print("The train_dataset is empty.")
else:
    print("Variable 'train_dataset' not found or is None. Please run the previous cell to create it.")

--- Inspecting the first example (train_dataset[0]) ---
{'input_ids': [0, 31420, 2582, 105, 29, 41883, 546, 407, 1739, 142317, 718, 190012, 1258, 49, 119065, 20, 75533, 29, 41883, 547, 407, 82847, 11519, 27546, 31420, 83420, 718, 190012, 1258, 75533, 29, 41883, 547, 407, 82847, 11519, 27546, 31420, 83420, 718, 190012, 1258, 77900, 65346, 1082, 29440, 14560, 1086, 183, 38088, 175436, 407, 23334, 97430, 44140, 16176, 190012, 292, 1097, 39061, 969, 54662, 53173, 49, 155980, 969, 54662, 103, 5999, 5, 5187, 79015, 111918, 75533, 4, 135, 24010, 62415, 130, 237748, 13022, 14560, 78568, 31420, 2582, 5, 4531, 214461, 4, 414, 4060, 22051, 5509, 13299, 86146, 197, 14560, 126750, 25514, 8808, 43123, 87781, 22480, 32187, 5, 94, 170196, 13299, 77, 23494, 3505, 86665, 70209, 146666, 5, 134739, 93142, 3318, 30236, 32578, 77, 100998, 4798, 35, 8044, 3795, 113360, 4, 3077, 81871, 73712, 130, 4, 136037, 53711, 16176, 190012, 1339, 292, 83694, 31420, 2582, 5, 672, 26512, 15126, 33330, 80649, 43155, 50406,

## Model Training Setup

Now we set up the components needed for training:
1.  **Load Model**: Load `XLMRobertaForTokenClassification` with the correct number of labels.
2.  **Training Arguments**: Configure hyperparameters like learning rate, batch size, epochs, and output directories.
3.  **Metrics**: Define a function to compute evaluation metrics (precision, recall, F1) using `seqeval`.
4.  **Data Collator**: Use `DataCollatorForTokenClassification` for dynamic padding.
5.  **Trainer**: Initialize the `Trainer`.

In [10]:
# Split the dataset (e.g., 90% train, 10% evaluation)
# Use shuffle=True (default) and a seed for reproducibility
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)

# Assign the splits to new variables
train_split = split_dataset['train']
eval_split = split_dataset['test'] # The 'test' key holds the evaluation split

print(f"Original dataset size: {len(train_dataset)}")
print(f"Train split size: {len(train_split)}")
print(f"Evaluation split size: {len(eval_split)}")

# Display the structure of the splits
print("\nTrain split structure:")
print(train_split)
print("\nEvaluation split structure:")
print(eval_split)

Original dataset size: 190
Train split size: 171
Evaluation split size: 19

Train split structure:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 171
})

Evaluation split structure:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 19
})


In [11]:
from transformers import XLMRobertaForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate # Use evaluate instead of load_metric

print("Setting up training components...")
# 1. Load Model
model = XLMRobertaForTokenClassification.from_pretrained(
    tokenizer_name,
    id2label=i2l,
    label2id=l2i
)
print(f"Model loaded: {tokenizer_name} with {model.config.num_labels} labels.")

# 2. Training Arguments
# Note: Adjust these arguments based on your resources and needs
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch", # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Add save_strategy if you want to save checkpoints
    # load_best_model_at_end=True, # Requires save_strategy and evaluation_strategy
)
print("TrainingArguments defined.")

# 3. Metrics Calculation
metric = evaluate.load("seqeval") # Use evaluate.load
# Get the actual label names (without B-/I- prefixes for seqeval)
# Assuming i2l contains labels like 'O', 'B-Label1', 'I-Label1', ...
label_list = list(i2l.values())

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) - where label is -100
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
print("compute_metrics function defined.")

# 4. Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)
print("DataCollatorForTokenClassification initialized.")

# 5. Trainer
# Assuming train_dataset is your training data.
# If you have a separate validation set, add it as `eval_dataset`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split, # Using train_dataset for evaluation here, replace if you have a validation set
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer initialized.")

Setting up training components...


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: xlm-roberta-base with 47 labels.
TrainingArguments defined.
compute_metrics function defined.
DataCollatorForTokenClassification initialized.
compute_metrics function defined.
DataCollatorForTokenClassification initialized.


  trainer = Trainer(


Trainer initialized.


## Start Training

Execute the training process.

In [12]:
# Start the training
if 'trainer' in locals() and trainer is not None:
    print("\nStarting training...")
    train_result = trainer.train()
    print("\nTraining finished.")

    # Optionally, save the final model, tokenizer, and training arguments
    # trainer.save_model("./final_model") 
    # print("Model saved to ./final_model")

    # Log metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
else:
    print("\nTrainer was not initialized. Cannot start training. Please check the setup cell.")


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,3.1495,2.121316,0.0,0.0,0.0,0.599847
2,1.9381,1.86292,0.0,0.0,0.0,0.599847
3,1.6839,1.819779,0.0,0.0,0.0,0.599847


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Training finished.
***** train metrics *****
  epoch                    =        3.0
  total_flos               =   124890GF
  train_loss               =     2.2091
  train_runtime            = 0:01:11.86
  train_samples_per_second =      7.138
  train_steps_per_second   =      0.459
