# XLM-RoBERTa NER Training

Fine-tune [`xlm-roberta-base`](https://huggingface.co/xlm-roberta-base) for token classification over trading entities. This notebook mirrors `ai/training/hf/train_ner.py` and is tailored for Google Colab usage.

## 1. Environment setup
Install the dependencies required for Hugging Face token classification.

In [16]:
!pip install -q datasets transformers evaluate accelerate sentencepiece seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


## 2. (Optional) Mount Google Drive
Mount Google Drive if your JSONL dataset lives there. Otherwise upload it via the file browser.

In [2]:
import sys

if "google.colab" in sys.modules:
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')
else:
    print('Not running inside Colab; skipping Drive mount.')

Mounted at /content/drive


## 3. Imports and configuration
Update the configuration defaults to match your dataset locations or output preferences.

In [14]:
from dataclasses import dataclass
from functools import partial
from typing import Dict, List, Tuple

import evaluate
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

ENTITY_LABELS = [
    'O',
    'B-PAIR',
    'I-PAIR',
    'B-LEVERAGE',
    'I-LEVERAGE',
    'B-ENTRY',
    'I-ENTRY',
    'B-STOP_LOSS',
    'I-STOP_LOSS',
    'B-TARGET',
    'I-TARGET',
]
LABEL2ID = {label: idx for idx, label in enumerate(ENTITY_LABELS)}
ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}
ENTITY_TO_PREFIX = {
    'PAIR': 'PAIR',
    'LEVERAGE': 'LEVERAGE',
    'ENTRY': 'ENTRY',
    'STOP_LOSS': 'STOP_LOSS',
    'TARGET': 'TARGET',
}

@dataclass
class TrainingConfig:
    data_file: str = '/content/ner_data.jsonl'
    output_dir: str = '/content/ner_extractor'
    epochs: int = 10
    batch_size: int = 16
    eval_batch_size: int = 32
    learning_rate: float = 3e-5
    test_split: float = 0.1
    seed: int = 42
    use_fp16: bool = True
    gradient_accumulation_steps: int = 2

config = TrainingConfig()
config

TrainingConfig(data_file='/content/ner_data.jsonl', output_dir='/content/ner_extractor', epochs=10, batch_size=16, eval_batch_size=32, learning_rate=3e-05, test_split=0.1, seed=42, use_fp16=True, gradient_accumulation_steps=2)

## 4. Tokenization and label alignment
Helper that aligns entity spans to token indices, mirroring the CLI script logic.

In [11]:
def tokenize_and_align_labels(tokenizer: AutoTokenizer, examples: Dict[str, List]):
    texts = examples['text']
    entities = examples['entities']

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding=False,
        return_offsets_mapping=True,
        max_length=256,
    )

    aligned_labels: List[List[int]] = []

    for offsets, sentence_entities in zip(tokenized['offset_mapping'], entities):
        labels = ['O'] * len(offsets)

        for entity in sentence_entities:
            ent_start = int(entity['start'])
            ent_end = int(entity['end'])
            ent_label = ENTITY_TO_PREFIX.get(entity['label'])
            if ent_label is None:
                continue

            for idx, (tok_start, tok_end) in enumerate(offsets):
                if tok_start == tok_end:
                    continue
                if tok_end <= ent_start or tok_start >= ent_end:
                    continue
                prefix = 'B' if tok_start == ent_start else 'I'
                labels[idx] = f"{prefix}-{ent_label}"


        label_ids: List[int] = []
        for (tok_start, tok_end), label in zip(offsets, labels):
            if tok_start == tok_end:
                label_ids.append(-100)
            else:
                label_ids.append(LABEL2ID[label])

        aligned_labels.append(label_ids)

    tokenized['labels'] = aligned_labels
    tokenized.pop('offset_mapping')
    return tokenized

## 5. Dataset preparation
Load the JSONL dataset, create train/eval splits, and tokenize with span alignment.

In [12]:
def load_and_prepare_dataset(
    data_file: str, test_split: float, seed: int
) -> Tuple[DatasetDict, AutoTokenizer]:
    dataset_dict = load_dataset("json", data_files=data_file)
    dataset = dataset_dict["train"]
    dataset = dataset.train_test_split(test_size=test_split, seed=seed)

    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    tokenize_fn = partial(tokenize_and_align_labels, tokenizer)

    tokenized = dataset.map(tokenize_fn, batched=True)
    return DatasetDict(
        {"train": tokenized["train"], "eval": tokenized["test"]}
    ), tokenizer

dataset_splits, tokenizer = load_and_prepare_dataset(config.data_file, config.test_split, config.seed)
dataset_splits

Map:   0%|          | 0/6408 [00:00<?, ? examples/s]

Map:   0%|          | 0/713 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'entities', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6408
    })
    eval: Dataset({
        features: ['text', 'entities', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 713
    })
})

## 6. Metric computation helper
Reuse the `seqeval` metrics to compute precision, recall, F1, and accuracy.

In [7]:
def compute_metrics(eval_pred) -> Dict[str, float]:
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    true_predictions: List[List[str]] = []
    true_labels: List[List[str]] = []

    for pred_seq, label_seq in zip(predictions, labels):
        filtered_preds: List[str] = []
        filtered_labels: List[str] = []

        for pred_id, label_id in zip(pred_seq, label_seq):
            if label_id == -100:
                continue
            filtered_preds.append(ID2LABEL[pred_id])
            filtered_labels.append(ID2LABEL[label_id])

        true_predictions.append(filtered_preds)
        true_labels.append(filtered_labels)

    metric = evaluate.load('seqeval')
    scores = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        'precision': scores['overall_precision'],
        'recall': scores['overall_recall'],
        'f1': scores['overall_f1'],
        'accuracy': scores['overall_accuracy'],
    }

## 7. Training
Configure the Hugging Face `Trainer`, kick off fine-tuning, and inspect evaluation metrics.

In [17]:
set_seed(config.seed)

model = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(ENTITY_LABELS),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=config.output_dir,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    learning_rate=config.learning_rate,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.eval_batch_size,
    num_train_epochs=config.epochs,
    weight_decay=0.01,
    logging_steps=50,
    fp16=config.use_fp16,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits['train'],
    eval_dataset=dataset_splits['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
metrics = trainer.evaluate()
metrics

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0379,0.032055,0.875616,0.907119,0.891089,0.990267
2,0.0276,0.026113,0.890963,0.925746,0.908022,0.991424
3,0.0233,0.023452,0.904691,0.930084,0.917212,0.992393
4,0.0182,0.021562,0.922177,0.925236,0.923704,0.992949
5,0.0146,0.021293,0.911933,0.937994,0.92478,0.99338
6,0.0153,0.020141,0.914107,0.934167,0.924028,0.993433
7,0.013,0.020816,0.919609,0.936974,0.92821,0.993604
8,0.0111,0.020171,0.928067,0.93825,0.93313,0.994043
9,0.0106,0.020645,0.927136,0.941567,0.934295,0.993945
10,0.0116,0.020865,0.923115,0.940546,0.931749,0.9939


{'eval_loss': 0.020645324140787125,
 'eval_precision': 0.9271356783919598,
 'eval_recall': 0.9415667262056647,
 'eval_f1': 0.9342954804405621,
 'eval_accuracy': 0.9939446677192478,
 'eval_runtime': 4.7485,
 'eval_samples_per_second': 150.153,
 'eval_steps_per_second': 4.844,
 'epoch': 10.0}

## 8. Save artifacts
Persist the fine-tuned model and tokenizer to the configured output directory.

In [18]:
trainer.save_model(config.output_dir)
tokenizer.save_pretrained(config.output_dir)
print(f'Model saved to {config.output_dir}')

Model saved to /content/ner_extractor


In [20]:
!zip -r /content/ner_extractor.zip /content/ner_extractor/config.json /content/ner_extractor/model.safetensors /content/ner_extractor/sentencepiece.bpe.model /content/ner_extractor/tokenizer.json /content/ner_extractor/special_tokens_map.json /content/ner_extractor/tokenizer_config.json /content/ner_extractor/training_args.bin

  adding: content/ner_extractor/config.json (deflated 55%)
  adding: content/ner_extractor/model.safetensors (deflated 25%)
  adding: content/ner_extractor/sentencepiece.bpe.model (deflated 49%)
  adding: content/ner_extractor/tokenizer.json (deflated 76%)
  adding: content/ner_extractor/special_tokens_map.json (deflated 52%)
  adding: content/ner_extractor/tokenizer_config.json (deflated 76%)
  adding: content/ner_extractor/training_args.bin (deflated 53%)


In [21]:
!mv /content/ner_extractor.zip /content/drive/MyDrive/ner_extractor.zip