# Hugging Face Transformers

## 1. Library Imports

In [None]:
import json
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from datasets import Dataset as HFDataset
from datasets import DatasetDict
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt
import matplotlib.patches as patches

## 3. Constant Definition 

In this cell, I'll document the entity types and their corresponding colors.

In [None]:
# Single table of (entity label, description, display color) triples.
# Both lookup dictionaries below are derived from it, so a label can never
# exist in one mapping and be missing from the other.
_ENTITY_SPECS = (
    ("ACTION", "Direct commands or actions mentioned in the message", "#4e79a7"),  # Blue
    ("SITUATION", "Racing context or circumstance descriptions", "#f28e2c"),       # Orange
    ("INCIDENT", "Accidents or on-track events", "#e15759"),                       # Red
    ("STRATEGY_INSTRUCTION", "Strategic directives", "#76b7b2"),                   # Teal
    ("POSITION_CHANGE", "References to overtakes or positions", "#59a14f"),        # Green
    ("PIT_CALL", "Specific calls for pit stops", "#edc949"),                       # Yellow
    ("TRACK_CONDITION", "Mentions of the track's state", "#af7aa1"),               # Purple
    ("TECHNICAL_ISSUE", "Mechanical or car-related problems", "#ff9da7"),          # Pink
    ("WEATHER", "References to weather conditions", "#9c755f"),                    # Brown
)

# Entity label -> human-readable description
ENTITY_TYPES = {label: meaning for label, meaning, _color in _ENTITY_SPECS}

# Entity label -> hex color used for entity visualization
ENTITY_COLORS = {label: color for label, _meaning, color in _ENTITY_SPECS}

print("Entity types defined:")
for entity, description in ENTITY_TYPES.items():
    print(f"  - {entity}: {description}")

Types of defined entities:
  - ACTION: Direct commands or actions mentioned in the message
  - SITUATION: Racing context or circumstance descriptions
  - INCIDENT: Accidents or on-track events
  - STRATEGY_INSTRUCTION: Strategic directives
  - POSITION_CHANGE: References to overtakes or positions
  - PIT_CALL: Specific calls for pit stops
  - TRACK_CONDITION: Mentions of the track's state
  - TECHNICAL_ISSUE: Mechanical or car-related problems
  - WEATHER: References to weather conditions


## 4. Load and Explore Data

In [11]:
# Load F1 radio data from JSON file
def load_f1_radio_data(json_file):
    """Load F1 radio data from a JSON file and print a short preview.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The parsed JSON content, returned unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.

    The preview (driver, first 100 characters of the message, first entity)
    is best-effort: it is printed only for the parts of the first record that
    have the expected shape, and never raises on a malformed record.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Loaded {len(data)} messages from {json_file}")

    # Show sample structure (only when the first record is a dict)
    sample = data[0] if isinstance(data, list) and data else None
    if isinstance(sample, dict):
        print("\nSample record structure:")
        print(f"  Driver: {sample.get('driver', 'N/A')}")

        # A present-but-null message must not break the slice below
        message = sample.get('radio_message')
        message_preview = message if isinstance(message, str) else 'N/A'
        print(f"  Radio message: {message_preview[:100]}...")

        # The entity list is read from the second element of 'annotations'
        annotations = sample.get('annotations')
        if isinstance(annotations, list) and len(annotations) > 1:
            entity_block = annotations[1]
            if isinstance(entity_block, dict) and 'entities' in entity_block:
                entities = entity_block['entities'] or []
                print(f"  Number of entities: {len(entities)}")
                if len(entities) > 0 and isinstance(message, str):
                    entity = entities[0]
                    entity_text = message[entity[0]:entity[1]]
                    print(f"  Sample entity: [{entity[0]}, {entity[1]}, '{entity_text}', '{entity[2]}']")

    return data



In [12]:
# Load the JSON data
json_file_path = "f1_radio_entity_annotations.json"
f1_data = load_f1_radio_data(json_file_path)

# Tally how often each entity label appears across all annotated messages
entity_counts = {}
for item in f1_data:
    # Records without a second annotation slot carry no entity list
    if 'annotations' not in item or len(item['annotations']) <= 1:
        continue
    annotation_block = item['annotations'][1]
    if not isinstance(annotation_block, dict) or 'entities' not in annotation_block:
        continue
    # Each entity is a (start, end, label) triple; only the label is counted
    for _, _, entity_type in annotation_block['entities']:
        if entity_type in entity_counts:
            entity_counts[entity_type] += 1
        else:
            entity_counts[entity_type] = 1

print("\nEntity type distribution in dataset:")
# Most frequent label first
ranked_counts = sorted(entity_counts.items(), key=lambda pair: pair[1], reverse=True)
for entity_type, count in ranked_counts:
    print(f"  - {entity_type}: {count}")

Loaded 529 messages from f1_radio_entity_annotations.json

Sample record structure:
  Driver: 1
  Radio message: So don't forget Max, use your head please. Are we both doing it or what? You just follow my instruct...
  Number of entities: 3
  Sample entity: [82, 103, 'follow my instruction', 'ACTION']

Entity type distribution in dataset:
  - SITUATION: 255
  - ACTION: 165
  - STRATEGY_INSTRUCTION: 137
  - TECHNICAL_ISSUE: 137
  - WEATHER: 112
  - POSITION_CHANGE: 83
  - INCIDENT: 78
  - TRACK_CONDITION: 62
  - PIT_CALL: 42


## 5. Preprocessing F1 Radio Data

In [27]:
def preprocess_f1_data(data, sample_index=10):
    """Extract and preprocess F1 radio data with valid annotations.

    A record is kept only when it has a non-blank string under
    'radio_message' and a non-empty 'entities' list inside the second element
    of 'annotations'. Every other record is counted as skipped.

    Args:
        data: Iterable of message records (dicts).
        sample_index: Position of the processed message to print as a preview.
            It is clamped to the available range, so short inputs no longer
            raise IndexError.

    Returns:
        A list of dicts with the keys 'text', 'entities' and 'driver'
        ('driver' is None when the record has no such key).
    """
    processed_data = []
    skipped_count = 0

    for item in data:
        if 'radio_message' not in item or 'annotations' not in item:
            skipped_count += 1
            continue

        text = item['radio_message']

        # Skip items with empty, null or non-string text
        if not isinstance(text, str) or text.strip() == "":
            skipped_count += 1
            continue

        # Extract entities if they exist in the expected format:
        # annotations[1] must be a dict holding a non-empty 'entities' list
        annotations = item['annotations']
        entities = None
        if (
            isinstance(annotations, (list, tuple))
            and len(annotations) > 1
            and isinstance(annotations[1], dict)
        ):
            entities = annotations[1].get('entities')

        if not entities:
            skipped_count += 1
            continue

        # Add to processed data
        processed_data.append({
            'text': text,
            'entities': entities,
            'driver': item.get('driver', None)
        })

    print(f"Processed {len(processed_data)} messages with valid annotations")
    print(f"Skipped {skipped_count} messages with missing or invalid annotations")

    # Show a sample of processed data (index clamped to what is available)
    if processed_data:
        preview_index = min(max(sample_index, 0), len(processed_data) - 1)
        sample = processed_data[preview_index]
        print("\nSample processed message:")
        print(f"Text: {sample['text']}")
        print("Entities:")
        for start, end, entity_type in sample['entities']:
            entity_text = sample['text'][start:end]
            print(f"  - [{start}, {end}] '{entity_text}' ({entity_type})")

    return processed_data



In [28]:
# Preprocess the loaded data: keep only messages that have text and at least one annotated entity
processed_f1_data = preprocess_f1_data(f1_data)

Processed 399 messages with valid annotations
Skipped 130 messages with missing or invalid annotations

Sample processed message:
Text: Max, we've currently got yellows in turn 7. Ferrari in the wall, no? Yes, that's Charles stopped. We are expecting the potential of an aborted start, but just keep to your protocol at the moment.
Entities:
  - [159, 194] 'keep to your protocol at the moment' (ACTION)
  - [5, 42] 'we've currently got yellows in turn 7' (SITUATION)
  - [98, 148] 'We are expecting the potential of an aborted start' (SITUATION)
  - [44, 63] 'Ferrari in the wall' (INCIDENT)
  - [74, 96] 'that's Charles stopped' (INCIDENT)


## 6. Convert to BIO tagging format

More detailed information about the BIO tagging format can be found [here](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging)).

### BIO Format Explanation

The **BIO format** is a way to label words in a sentence to indicate if they are part of a named entity, and if so, where in the entity they belong. It uses three types of labels:

- **B- (Beginning)**: The first word in an entity.
- **I- (Inside)**: Any word inside the entity that isn't the first one.
- **O (Outside)**: Words that are not part of any entity.

---

### Example Radio

Here is an example of a radio message from Max Verstappen's track engineer: 

**Text:**  
*"Max, we've currently got yellows in turn 7. Ferrari in the wall, no? Yes, that's Charles stopped. We are expecting the potential of an aborted start, but just keep to your protocol at the moment."*

Here are the entities mentioned in the message:

1. **'keep to your protocol at the moment'** (ACTION)
2. **'we've currently got yellows in turn 7'** (SITUATION)
3. **'We are expecting the potential of an aborted start'** (SITUATION)
4. **'Ferrari in the wall'** (INCIDENT)
5. **'that's Charles stopped'** (INCIDENT)

---

### Breaking the Sentence

We break the sentence into words and then tag them as follows:

| Word            | BIO Tag          |
|-----------------|------------------|
| Max,            | O                |
| we've           | B-SITUATION      |
| currently       | I-SITUATION      |
| got             | I-SITUATION      |
| yellows         | I-SITUATION      |
| in              | I-SITUATION      |
| turn            | I-SITUATION      |
| 7.              | I-SITUATION      |
| Ferrari         | B-INCIDENT       |
| in              | I-INCIDENT       |
| the             | I-INCIDENT       |
| wall,           | I-INCIDENT       |
| no?             | O                |
| Yes,            | O                |
| that's          | B-INCIDENT       |
| Charles         | I-INCIDENT       |
| stopped.        | I-INCIDENT       |
| We              | B-SITUATION      |
| are             | I-SITUATION      |
| expecting       | I-SITUATION      |
| the             | I-SITUATION      |
| potential       | I-SITUATION      |
| of              | I-SITUATION      |
| an              | I-SITUATION      |
| aborted         | I-SITUATION      |
| start,          | I-SITUATION      |
| but             | O                |
| just            | O                |
| keep            | B-ACTION         |
| to              | I-ACTION         |
| your            | I-ACTION         |
| protocol        | I-ACTION         |
| at              | I-ACTION         |
| the             | I-ACTION         |
| moment.         | I-ACTION         |




In [31]:
def create_ner_tags(text, entities):
    """Convert character-based entity spans to token-based BIO tags.

    The text is split on whitespace. Each word's real character span is
    located in the original text, so repeated spaces, newlines and leading
    whitespace do not shift the offsets. Every word that overlaps an entity
    span is tagged: the first one "B-<type>", the following ones "I-<type>".

    Args:
        text: The raw message.
        entities: Iterable of (start_char, end_char, entity_type) triples,
            with end_char exclusive.

    Returns:
        A (words, tags) pair of equally long lists.

    Spans that are empty, negative or reach past the end of the text are
    ignored. Entities are applied in the given order, so a later span
    overwrites the tags of an earlier, overlapping one.
    """
    words = text.split()
    tags = ["O"] * len(words)

    # Locate each word in the original text. Searching from the end of the
    # previous word always hits the right occurrence, because only whitespace
    # lies between two consecutive words.
    word_spans = []
    search_from = 0
    for word in words:
        word_start = text.find(word, search_from)
        word_end = word_start + len(word)
        word_spans.append((word_start, word_end))
        search_from = word_end

    # Apply entity tags
    for start_char, end_char, entity_type in entities:
        # Skip invalid spans
        if (
            start_char < 0
            or start_char >= len(text)
            or end_char > len(text)
            or start_char >= end_char
        ):
            continue

        # Words whose character range overlaps [start_char, end_char)
        covered = [
            word_idx
            for word_idx, (word_start, word_end) in enumerate(word_spans)
            if word_start < end_char and word_end > start_char
        ]
        if not covered:
            # The span covers whitespace only
            continue

        # Tag the first word as B-entity, subsequent words as I-entity
        tags[covered[0]] = f"B-{entity_type}"
        for word_idx in covered[1:]:
            tags[word_idx] = f"I-{entity_type}"

    return words, tags





In [32]:
def convert_to_bio_format(processed_data, sample_index=10):
    """Convert processed data to BIO tagging format.

    Args:
        processed_data: Iterable of dicts with the keys 'text' and 'entities'
            (and optionally 'driver').
        sample_index: Position of the converted message to print as a preview.
            It is clamped to the available range, so short inputs no longer
            raise IndexError.

    Returns:
        A list of dicts with the keys 'tokens', 'ner_tags' and 'driver'
        ('driver' is None when the input item has no such key).
    """
    bio_data = []
    mapping_errors = 0

    for item in processed_data:
        text = item['text']
        entities = item['entities']

        # Convert to BIO tags
        words, tags = create_ner_tags(text, entities)

        # A message that has entities but ends up with only "O" tags means
        # none of its spans could be mapped onto words
        if len(entities) > 0 and all(tag == "O" for tag in tags):
            mapping_errors += 1

        bio_data.append({
            "tokens": words,
            "ner_tags": tags,
            "driver": item.get('driver', None)
        })

    print(f"Converted {len(bio_data)} messages to BIO format")
    print(f"Mapping errors: {mapping_errors} (messages where no entities were mapped)")

    # Show an example (index clamped to what is available)
    if bio_data:
        preview_index = min(max(sample_index, 0), len(bio_data) - 1)
        sample = bio_data[preview_index]
        print("\nSample BIO tagging:")
        print(f"Original text: {' '.join(sample['tokens'])}")
        for token, tag in zip(sample['tokens'], sample['ner_tags']):
            print(f"  {token} -> {tag}")

    return bio_data

In [33]:
# Convert processed data to BIO format: whitespace-split tokens with one BIO tag per token
bio_data = convert_to_bio_format(processed_f1_data)

Converted 399 messages to BIO format
Mapping errors: 0 (messages where no entities were mapped)

Sample BIO tagging:
Original text: Max, we've currently got yellows in turn 7. Ferrari in the wall, no? Yes, that's Charles stopped. We are expecting the potential of an aborted start, but just keep to your protocol at the moment.
  Max, -> O
  we've -> B-SITUATION
  currently -> I-SITUATION
  got -> I-SITUATION
  yellows -> I-SITUATION
  in -> I-SITUATION
  turn -> I-SITUATION
  7. -> I-SITUATION
  Ferrari -> B-INCIDENT
  in -> I-INCIDENT
  the -> I-INCIDENT
  wall, -> I-INCIDENT
  no? -> O
  Yes, -> O
  that's -> B-INCIDENT
  Charles -> I-INCIDENT
  stopped. -> I-INCIDENT
  We -> B-SITUATION
  are -> I-SITUATION
  expecting -> I-SITUATION
  the -> I-SITUATION
  potential -> I-SITUATION
  of -> I-SITUATION
  an -> I-SITUATION
  aborted -> I-SITUATION
  start, -> I-SITUATION
  but -> O
  just -> O
  keep -> B-ACTION
  to -> I-ACTION
  your -> I-ACTION
  protocol -> I-ACTION
  at -> I-ACTION

### What the Function Does

The function `create_ner_tags` takes the text and entities and converts them into BIO format. It starts by splitting the text into words. 

Then, it maps each word to a tag: "O" for words that are not part of an entity, "B-" for the first word of an entity, and "I-" for subsequent words inside the entity. 

The function also uses the character positions of the entities to determine which words they correspond to. Once the tags are assigned, the function returns the words and their BIO tags, ready for use in training a Named Entity Recognition (NER) model.

## 7. Create tag mappings and prepare datasets.

### 7.1 `create_tag_mappings`

This function creates mappings between NER (Named Entity Recognition) tags and unique IDs. It does this by:

1. Collecting all unique NER tags from the `bio_data`.
2. Sorting and assigning each unique tag an ID.
3. Creating two mappings:
   - `tag2id`: Maps each tag to its corresponding ID.
   - `id2tag`: Maps each ID back to its corresponding tag.

It then prints out the mappings and returns the two dictionaries: `tag2id` and `id2tag`.

**What it does:**
- Converts NER tags into unique IDs for easier processing in machine learning models.
- Helps with transforming the tags when working with model inputs and outputs.

In [34]:
def create_tag_mappings(bio_data):
    """Build the two lookup tables between NER tags and integer IDs.

    Every distinct tag found under "ner_tags" in `bio_data` is collected,
    the tags are sorted, and each one receives its position in that order
    as its ID.

    Returns:
        (tag2id, id2tag): tag -> ID and ID -> tag dictionaries.
    """
    # Distinct tags across all items, in sorted order
    ordered_tags = sorted({tag for record in bio_data for tag in record["ner_tags"]})

    # Fill both directions in one pass so they always mirror each other
    tag2id = {}
    id2tag = {}
    for position, tag_name in enumerate(ordered_tags):
        tag2id[tag_name] = position
        id2tag[position] = tag_name

    print(f"Created mappings for {len(tag2id)} unique tags:")
    for tag, idx in tag2id.items():
        print(f"  {tag}: {idx}")

    return tag2id, id2tag

In [35]:
# Create tag mappings: tag -> id and id -> tag lookups built from the tags present in bio_data
tag2id, id2tag = create_tag_mappings(bio_data)

Created mappings for 19 unique tags:
  B-ACTION: 0
  B-INCIDENT: 1
  B-PIT_CALL: 2
  B-POSITION_CHANGE: 3
  B-SITUATION: 4
  B-STRATEGY_INSTRUCTION: 5
  B-TECHNICAL_ISSUE: 6
  B-TRACK_CONDITION: 7
  B-WEATHER: 8
  I-ACTION: 9
  I-INCIDENT: 10
  I-PIT_CALL: 11
  I-POSITION_CHANGE: 12
  I-SITUATION: 13
  I-STRATEGY_INSTRUCTION: 14
  I-TECHNICAL_ISSUE: 15
  I-TRACK_CONDITION: 16
  I-WEATHER: 17
  O: 18


---

### 7.2 `prepare_datasets`

This function prepares the dataset for training a model by splitting it into training, validation, and test sets using the Hugging Face library. Here's what it does:

1. Converts the input `bio_data` into a Hugging Face `Dataset`.
2. Splits the data into two parts: training + validation, and test.
3. Further splits the training data into training and validation sets based on the specified sizes (`test_size` and `val_size`).
4. Returns a `DatasetDict` containing the `train`, `validation`, and `test` sets.

**What it does:**
- Converts the data into a format suitable for machine learning.
- Splits the data into three parts: training, validation, and test sets for model evaluation.

In [36]:
def prepare_datasets(bio_data, test_size=0.1, val_size=0.1, seed=42):
    """Build a Hugging Face DatasetDict with train / validation / test splits.

    Args:
        bio_data: List of records passed to `HFDataset.from_list`.
        test_size: Share of all examples held out for the test split.
        val_size: Share of all examples held out for the validation split.
        seed: Seed handed to both `train_test_split` calls.

    Returns:
        A DatasetDict with the keys "train", "validation" and "test".
    """
    full_dataset = HFDataset.from_list(bio_data)

    # Step 1: carve the test split off the full dataset
    holdout_split = full_dataset.train_test_split(test_size=test_size, seed=seed)

    # Step 2: carve validation off what is left. val_size is expressed
    # relative to the full dataset, so rescale it to the remaining part.
    remaining_share = 1 - test_size
    inner_split = holdout_split["train"].train_test_split(
        test_size=val_size / remaining_share,
        seed=seed,
    )

    datasets = DatasetDict({
        "train": inner_split["train"],
        "validation": inner_split["test"],
        "test": holdout_split["test"],
    })

    print(f"Prepared datasets with:")
    print(f"  - Train: {len(datasets['train'])} examples")
    print(f"  - Validation: {len(datasets['validation'])} examples")
    print(f"  - Test: {len(datasets['test'])} examples")

    return datasets

In [37]:
# Split into train/validation/test using the function defaults (test_size=0.1, val_size=0.1, seed=42)
datasets = prepare_datasets(bio_data)

Prepared datasets with:
  - Train: 319 examples
  - Validation: 40 examples
  - Test: 40 examples


## 8. Tokenize and Align Labels

In [38]:
def tokenize_and_align_labels(examples, tokenizer, tag2id, max_length=128):
    """Tokenize pre-split words and attach one label ID per model token.

    Only the first token of each word receives the word's tag ID. Special
    tokens (word id None) and the remaining tokens of a word receive -100.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of tag strings, one per word).
        tokenizer: Callable tokenizer whose result exposes `word_ids`.
        tag2id: Mapping from tag string to integer ID.
        max_length: Length that sequences are padded / truncated to.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(
        examples["tokens"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word_id = None
        for word_id in encoding.word_ids(batch_index=batch_idx):
            # -100 for special tokens and for every non-first token of a word
            if word_id is None or word_id == last_word_id:
                row.append(-100)
            else:
                row.append(tag2id[word_tags[word_id]])
            last_word_id = word_id
        aligned_labels.append(row)

    encoding["labels"] = aligned_labels
    return encoding

In [39]:
# Initialize tokenizer (smaller model for demonstration purposes)
# NOTE(review): in the recorded run this cell failed with an ImportError asking for the
# `protobuf` library; install it in the kernel environment (and restart the kernel) first.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Tokenize a sample to demonstrate alignment
if bio_data:
    # Wrap the first message as a batch of one, the shape the aligner expects
    first_record = bio_data[0]
    sample = {field: [first_record[field]] for field in ("tokens", "ner_tags")}
    tokenized_sample = tokenize_and_align_labels(sample, tokenizer, tag2id)

    print("Sample tokenization and label alignment:")
    print(f"Original tokens: {sample['tokens'][0][:5]}...")
    print(f"Tokenized: {tokenizer.convert_ids_to_tokens(tokenized_sample['input_ids'][0])[:10]}...")

    # Show how labels are aligned
    print("\nLabel alignment example (first few tokens):")
    sample_input_ids = tokenized_sample['input_ids'][0]
    sample_label_ids = tokenized_sample['labels'][0]
    for i in range(10):  # Show first 10 tokens
        token = tokenizer.convert_ids_to_tokens(sample_input_ids[i])
        label_id = sample_label_ids[i]
        # -100 marks positions without a label; unknown IDs are shown the same way
        if label_id == -100:
            label = "IGNORE"
        else:
            label = id2tag.get(label_id, "IGNORE")
        print(f"  {token} -> {label}")

---

## 9. Defining Evaluation Metrics