# Experiments in training machine learning models, including data preparation, model training, and evaluation on a test dataset.

In [None]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['USE_TF'] = 'NO'

import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from nervaluate import Evaluator
import torch
import glob

# Report the library version and the accelerator this run will use
has_cuda = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Data preparation.

In [None]:
def load_data(file_path):
    """Read a two-column, tab-separated CoNLL-style file into sentences.

    Every non-blank line is expected to hold ``token<TAB>tag``. A blank line
    closes the sentence collected so far. Lines that do not split into
    exactly two tab-separated fields are skipped.

    Args:
        file_path: Path of the TSV file; it is read as UTF-8.

    Returns:
        A list with one dict per sentence. Each dict has the keys
        ``'tokens'`` and ``'ner_tags'`` holding parallel lists of strings.
    """
    sentences = []
    words, tags = [], []

    def flush():
        """Store the pending sentence (if it has tokens) and start a new one."""
        nonlocal words, tags
        if words:
            sentences.append({'tokens': words, 'ner_tags': tags})
            words, tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = raw.strip()

            # A blank row is the sentence separator
            if not row:
                flush()
                continue

            fields = row.split('\t')
            # Only rows with exactly a token column and a tag column are used
            if len(fields) != 2:
                continue

            word, tag = fields
            words.append(word)
            tags.append(tag)

    # The file may end without a trailing blank line
    flush()
    return sentences


# Read the training and development splits from their TSV files
print("Loading data...")
train_data, dev_data = (load_data(tsv_path) for tsv_path in ("train_w.tsv", "dev_w.tsv"))

# Report how many sentences each split contains
train_size = len(train_data)
dev_size = len(dev_data)
print(f"Train: {train_size} sentences")
print(f"Dev: {dev_size} sentences")

In [None]:
# Collect every distinct NER tag that occurs in the training or development data
all_labels = set()
for example in train_data + dev_data:
    all_labels.update(example['ner_tags'])

# Sort so the label <-> id mapping is the same on every run
label_list = sorted(all_labels)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(f"✓ Labels: {len(label_list)}")
print(f"  {label_list}")

# Entity types for nervaluate: drop the scheme prefix ("B-", "I-") from each tag.
# Split on the FIRST hyphen only, so a type that itself contains a hyphen is kept
# whole and agrees with the tag[2:] slicing done in convert_to_entities.
entity_types = set()
for label in label_list:
    if label != 'O':
        entity_type = label.split('-', 1)[1] if '-' in label else label
        entity_types.add(entity_type)

entity_types = sorted(entity_types)
print(f"\n Entity types: {entity_types}")

## Estbert: tests and final training.

In [None]:
# Identifier (or local path) of the pretrained checkpoint used in this section
MODEL_NAME = '...'
OUTPUT_DIR = '...'  # folder that receives this model's checkpoints and result file

print(f"\nLoading {MODEL_NAME}...")

# Tokenizer belonging to MODEL_NAME; tokenize_and_align reads it as a global
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def tokenize_and_align(examples):
    """Tokenize pre-split words and project word-level NER tags onto subwords.

    Uses the module-level ``tokenizer`` and ``label2id``.

    Args:
        examples: Batch with ``'tokens'`` (one word list per sentence) and
            ``'ner_tags'`` (one tag list per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For each
        sentence, the first subword of a word carries the word's label id;
        special tokens and the remaining subwords carry -100.
    """
    encoding = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=False
    )

    def align(word_ids, word_tags):
        """Return one label id (or -100) per subword position."""
        aligned = []
        previous = None
        for current in word_ids:
            if current is None or current == previous:
                # Special token, or a continuation piece of the same word
                aligned.append(-100)
            else:
                # First piece of a new word takes that word's tag
                aligned.append(label2id[word_tags[current]])
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples['ner_tags'])
    ]
    return encoding


print("Preparing datasets...")

# Wrap the raw sentence dicts in Hugging Face datasets
raw_train = Dataset.from_list(train_data)
raw_eval = Dataset.from_list(dev_data)

# Tokenize and align labels; the original text columns are dropped afterwards
train_dataset = raw_train.map(
    tokenize_and_align,
    batched=True,
    remove_columns=raw_train.column_names
)
eval_dataset = raw_eval.map(
    tokenize_and_align,
    batched=True,
    remove_columns=raw_eval.column_names
)

print("Dataset is ready")


In [None]:
# New compute_metrics function using nervaluate
def convert_to_entities(tags):
    """
    Convert BIO tags to the entity list format passed to nervaluate.

    An entity starts at a ``B-`` tag, at an ``I-`` tag that has no open
    entity, or at an ``I-`` tag whose type differs from the open entity
    (model output is not guaranteed to be well-formed BIO). It is extended
    by ``I-`` tags of the same type and closed by ``O``, by the start of
    another entity, or by the end of the sentence. Tags that are neither
    ``O`` nor prefixed with ``B-``/``I-`` are ignored.

    Args:
        tags: List of BIO tags for one sentence

    Returns:
        List of entity dictionaries with 'label', 'start', 'end', where
        'end' is exclusive (index of the last token + 1)
    """
    entities = []
    current_entity = None

    for i, tag in enumerate(tags):
        if tag == 'O':
            if current_entity is not None:
                entities.append(current_entity)
                current_entity = None
        elif tag.startswith(('B-', 'I-')):
            entity_type = tag[2:]  # Remove the 'B-' / 'I-' prefix

            # Only an I- tag of the SAME type continues the open entity;
            # a type change must not be merged into the previous span.
            continues_current = (
                tag.startswith('I-')
                and current_entity is not None
                and current_entity['label'] == entity_type
            )
            if continues_current:
                current_entity['end'] = i + 1
            else:
                if current_entity is not None:
                    entities.append(current_entity)
                current_entity = {
                    'label': entity_type,
                    'start': i,
                    'end': i + 1
                }

    # Close an entity that runs to the end of the sentence
    if current_entity is not None:
        entities.append(current_entity)

    return entities


def compute_metrics(p):
    """
    Compute entity-level metrics using nervaluate.

    Uses the module-level ``id2label``, ``entity_types`` and
    ``convert_to_entities``.

    Args:
        p: Pair ``(predictions, labels)`` as handed over by the Trainer.
            ``predictions`` is reduced with argmax over axis 2; positions
            whose label is -100 are excluded from the comparison.

    Returns:
        Dict with strict ``precision``/``recall``/``f1`` and
        ``partial_precision``/``partial_recall``/``partial_f1``.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Keep only positions that carry a real label, then map ids back to BIO tags
    true_predictions = [
        [id2label[pred_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
        for pred_row, gold_row in zip(predicted_ids, label_ids)
    ]
    true_labels = [
        [id2label[gold_id] for gold_id in gold_row if gold_id != -100]
        for gold_row in label_ids
    ]

    # Convert BIO tags to entity format for nervaluate
    true_entities = [convert_to_entities(tags) for tags in true_labels]
    pred_entities = [convert_to_entities(tags) for tags in true_predictions]

    evaluator = Evaluator(true_entities, pred_entities, tags=entity_types)
    overall = evaluator.evaluate()['overall']

    strict_results = overall['strict']
    partial_results = overall['partial']

    return {
        'precision': strict_results.precision,
        'recall': strict_results.recall,
        'f1': strict_results.f1,
        # Partial matching scores, reported next to the strict ones
        'partial_precision': partial_results.precision,
        'partial_recall': partial_results.recall,
        'partial_f1': partial_results.f1,
    }

print("Metrics function ready (using nervaluate)")
print("Evaluation scheme: strict (exact boundary + correct type)")
print("Aggregation: micro-average (default in nervaluate)")

In [None]:
# Collator that turns lists of tokenized examples into batches for the Trainer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


def arguments(learning_r, batch):
    """Build the TrainingArguments for one short (3-epoch) grid-search run.

    Args:
        learning_r: Learning rate for the run.
        batch: Per-device batch size, used for both training and evaluation.

    Returns:
        A TrainingArguments object whose output directory is named after
        the learning rate and batch size.
    """
    settings = {
        "output_dir": f"{OUTPUT_DIR}/lr{learning_r}_bs{batch}",
        "learning_rate": learning_r,
        "per_device_train_batch_size": batch,
        "per_device_eval_batch_size": batch,
        "num_train_epochs": 3,
        "weight_decay": 0.01,
        # Evaluate and checkpoint once per epoch so the best epoch can be restored
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "logging_steps": 100,
        "save_total_limit": 2,
        "push_to_hub": False,
        "report_to": "none",
    }
    return TrainingArguments(**settings)

print("Training arguments configured")

Tests:

In [None]:
# Hyperparameter grid: every learning rate is tried with every batch size
learning_rates = [1e-5, 5e-6, 1e-6, 5e-5]
batch_sizes = [16, 24, 32]
results_list = []

for l in learning_rates:
    for b in batch_sizes:
        # Each run starts from a freshly loaded pretrained model
        fresh_model = AutoModelForTokenClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(label2id),
            id2label=id2label,
            label2id=label2id
        )
        trainer = Trainer(
            model=fresh_model,
            args=arguments(l, b),
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        print("Trainer created")
        print("\n" + "=" * 80)
        print("STARTING TRAINING - EstBERT")
        print("=" * 80)
        print(f"Model: {MODEL_NAME}")
        print(f"Training samples: {len(train_data)}")
        print(f"Dev samples: {len(dev_data)}")
        print("Epochs: 3")
        print("Batchsize:", b)
        print("Learning rate:", l)
        print("=" * 80 + "\n")

        trainer.train()

        print(" Training complete!")
        print("\n" + "=" * 80)
        print("EVALUATION")
        print("=" * 80)

        # Scores of this run on the development set
        results = trainer.evaluate()

        print("\nResults:")
        print(f"  Precision: {results['eval_precision']:.4f}")
        print(f"  Recall:    {results['eval_recall']:.4f}")
        print(f"  F1 Score:  {results['eval_f1']:.4f}")
        print(f"  Loss:      {results['eval_loss']:.4f}")
        print("=" * 80)

        # One row per configuration, ranked later
        run_row = {"learning_rate": l, "batch_size": b}
        for metric in ("precision", "recall", "f1", "loss"):
            run_row[metric] = results[f"eval_{metric}"]
        results_list.append(run_row)

Looking for the best parameters:

In [None]:
# Rank the grid-search runs by their "f1" column, best first (the table is the cell's output)
pd.DataFrame(results_list).sort_values("f1", ascending=False)

Training with the best parameters:

In [None]:
# Final EstBERT run: 50 epochs is only an upper bound, early stopping ends the run sooner
final_settings = {
    "output_dir": f"{OUTPUT_DIR}/best",
    "learning_rate": 5e-05,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 50,
    "weight_decay": 0.01,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
    "logging_steps": 100,
    "save_total_limit": 2,
    "report_to": "none",
}
training_args = TrainingArguments(**final_settings)

final_model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Stop once the monitored metric has not improved for 4 consecutive evaluations
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=4,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Trainer created")
print("\n" + "=" * 80)
print("STARTING TRAINING - EstBERT")
print("=" * 80)
print(f"Model: {MODEL_NAME}")
print(f"Training samples: {len(train_data)}")
print(f"Dev samples: {len(dev_data)}")
print("Epochs: ?")
print("Batchsize: 16")
print("Learning rate: 5e-05")
print("=" * 80 + "\n")

trainer.train()
print("Training complete!")

In [None]:
print("\n" + "=" * 80)
print("EVALUATION")
print("=" * 80)

# Score the trained model on the Trainer's evaluation dataset
results = trainer.evaluate()

print("\nResults:")
print(f"  Precision: {results['eval_precision']:.4f}")
print(f"  Recall:    {results['eval_recall']:.4f}")
print(f"  F1 Score:  {results['eval_f1']:.4f}")
print(f"  Loss:      {results['eval_loss']:.4f}")
print("=" * 80)

# Write a short plain-text summary of the scores next to the checkpoints
report_lines = [
    "EstBERT NER Results\n",
    "=" * 50 + "\n",
    f"Precision: {results['eval_precision']:.4f}\n",
    f"Recall: {results['eval_recall']:.4f}\n",
    f"F1 Score: {results['eval_f1']:.4f}\n",
    f"Loss: {results['eval_loss']:.4f}\n",
]
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(f"{OUTPUT_DIR}/results.txt", 'w') as f:
    f.writelines(report_lines)

print(f"\nResults saved to {OUTPUT_DIR}/results.txt")
print(f"Model saved to {OUTPUT_DIR}/")

## Est-roberta: tests and final training.

In [None]:
# Identifier (or local path) of the pretrained checkpoint used in this section
MODEL_NAME_ROBERTA = '...'
# Folder that receives this model's checkpoints and result file
OUTPUT_DIR_ROBERTA = '...'

print(f"Loading {MODEL_NAME_ROBERTA}...")

# Tokenizer belonging to MODEL_NAME_ROBERTA; tokenize_and_align_roberta reads it as a global
tokenizer_roberta = AutoTokenizer.from_pretrained(MODEL_NAME_ROBERTA)
print("Est-RoBERTa loaded")

# This instance is the one trained in the final "best parameters" run;
# the grid-search loop loads its own fresh model for every configuration.
model_roberta = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME_ROBERTA,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

In [None]:
def tokenize_and_align_roberta(examples):
    """Tokenize pre-split words with the RoBERTa tokenizer and align NER labels.

    Uses the module-level ``tokenizer_roberta`` and ``label2id``.

    Args:
        examples: Batch with ``'tokens'`` (one word list per sentence) and
            ``'ner_tags'`` (one tag list per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: the first
        subword of each word gets the word's label id, everything else
        (special tokens, later subwords) gets -100.
    """
    encoding = tokenizer_roberta(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=False
    )

    batch_labels = []
    for row, word_tags in enumerate(examples['ner_tags']):
        row_labels = []
        last_seen = None
        for word_index in encoding.word_ids(batch_index=row):
            starts_new_word = word_index is not None and word_index != last_seen
            # Only the first piece of a word contributes to the loss
            row_labels.append(label2id[word_tags[word_index]] if starts_new_word else -100)
            last_seen = word_index
        batch_labels.append(row_labels)

    encoding["labels"] = batch_labels
    return encoding


print("Preparing datasets for Est-RoBERTa...")

# Wrap the raw sentence dicts in Hugging Face datasets
raw_train_roberta = Dataset.from_list(train_data)
raw_eval_roberta = Dataset.from_list(dev_data)

# Tokenize and align labels; the original text columns are dropped afterwards
train_dataset_roberta = raw_train_roberta.map(
    tokenize_and_align_roberta,
    batched=True,
    remove_columns=raw_train_roberta.column_names
)
eval_dataset_roberta = raw_eval_roberta.map(
    tokenize_and_align_roberta,
    batched=True,
    remove_columns=raw_eval_roberta.column_names
)

print("Datasets ready for Est-RoBERTa")


In [None]:
def args_roberta(learning_r, batch):
    """Build the TrainingArguments for one short (3-epoch) Est-RoBERTa grid-search run.

    Args:
        learning_r: Learning rate for the run.
        batch: Per-device batch size, used for both training and evaluation.

    Returns:
        A TrainingArguments object whose output directory is named after
        the learning rate and batch size.
    """
    settings = {
        "output_dir": f"{OUTPUT_DIR_ROBERTA}/lr{learning_r}_bs{batch}",
        "learning_rate": learning_r,
        "per_device_train_batch_size": batch,
        "per_device_eval_batch_size": batch,
        "num_train_epochs": 3,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "logging_steps": 100,
        "save_total_limit": 2,
        "push_to_hub": False,
        "save_steps": 500,
        "report_to": "none",
    }
    return TrainingArguments(**settings)

Tests:

In [None]:
# Hyperparameter grid: every learning rate is tried with every batch size
learning_rates = [1e-5, 5e-6, 1e-6, 5e-5]
batch_sizes = [16, 24, 32]
results_list_roberta = []

data_collator_roberta = DataCollatorForTokenClassification(tokenizer_roberta)
# The metric function does not depend on the tokenizer, so it is shared between models
compute_metrics_roberta = compute_metrics

for lr in learning_rates:
    for bs in batch_sizes:
        # Each run starts from a freshly loaded pretrained model
        trainer_roberta = Trainer(
            model=AutoModelForTokenClassification.from_pretrained(
                MODEL_NAME_ROBERTA,
                num_labels=len(label2id),
                id2label=id2label,
                label2id=label2id
            ),
            args=args_roberta(lr, bs),
            train_dataset=train_dataset_roberta,
            eval_dataset=eval_dataset_roberta,
            tokenizer=tokenizer_roberta,
            data_collator=data_collator_roberta,
            compute_metrics=compute_metrics_roberta,
        )
        # Announce the trainer when it is built, not after training has finished
        print("✓ Est-RoBERTa trainer created")
        print("\n" + "="*80)
        print("STARTING TRAINING - Est-RoBERTa")
        print("="*80)

        print(f"Model: {MODEL_NAME_ROBERTA}")
        print(f"Training samples: {len(train_data)}")
        print(f"Dev samples: {len(dev_data)}")
        print("Epochs: 3")
        print("Batch size:", bs)
        print("Learning rate:", lr)
        print("="*80 + "\n")

        trainer_roberta.train()

        print("\n✓ Est-RoBERTa training complete!")

        print("\n" + "="*80)
        print("EVALUATION - Est-RoBERTa")
        print("="*80)

        # Scores of this run on the development set
        results_roberta = trainer_roberta.evaluate()

        print("\nEst-RoBERTa Results:")
        print(f"  Precision: {results_roberta['eval_precision']:.4f}")
        print(f"  Recall:    {results_roberta['eval_recall']:.4f}")
        print(f"  F1 Score:  {results_roberta['eval_f1']:.4f}")
        print(f"  Loss:      {results_roberta['eval_loss']:.4f}")
        print("="*80)

        # One row per configuration, ranked later
        results_list_roberta.append({
            "learning_rate": lr,
            "batch_size": bs,
            "precision": results_roberta["eval_precision"],
            "recall": results_roberta["eval_recall"],
            "f1": results_roberta["eval_f1"],
            "loss": results_roberta["eval_loss"]
        })

Looking for the best parameters:

In [None]:
# Rank the Est-RoBERTa grid-search runs by their "f1" column, best first (the table is the cell's output)
pd.DataFrame(results_list_roberta).sort_values("f1", ascending=False)

Training with the best parameters:

In [None]:
# Final Est-RoBERTa run: 50 epochs is only an upper bound, early stopping ends the run sooner
final_settings_roberta = {
    "output_dir": f"{OUTPUT_DIR_ROBERTA}/best",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 50,
    "weight_decay": 0.01,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
    "logging_steps": 100,
    "save_total_limit": 2,
    "push_to_hub": False,
    "report_to": "none",
}
training_args_roberta = TrainingArguments(**final_settings_roberta)

# Stop once the monitored metric has not improved for 4 consecutive evaluations
early_stopping_roberta = EarlyStoppingCallback(
    early_stopping_patience=4,
    early_stopping_threshold=0.0
)

trainer_roberta = Trainer(
    model=model_roberta,
    args=training_args_roberta,
    train_dataset=train_dataset_roberta,
    eval_dataset=eval_dataset_roberta,
    tokenizer=tokenizer_roberta,
    data_collator=data_collator_roberta,
    compute_metrics=compute_metrics_roberta,
    callbacks=[early_stopping_roberta],
)
print("\n" + "=" * 80)
print("STARTING TRAINING - Est-RoBERTa")
print("=" * 80)

print(f"Model: {MODEL_NAME_ROBERTA}")
print(f"Training samples: {len(train_data)}")
print(f"Dev samples: {len(dev_data)}")
print("Epochs: ?")
print("Batch size: 16")
print("Learning rate: 5e-5")
print("=" * 80 + "\n")

trainer_roberta.train()

print("\n Est-RoBERTa training complete!")

In [None]:
print("\n" + "=" * 80)
print("EVALUATION - Est-RoBERTa")
print("=" * 80)

# Score the trained model on the Trainer's evaluation dataset
results_roberta = trainer_roberta.evaluate()

print("\nEst-RoBERTa Results:")
print(f"  Precision: {results_roberta['eval_precision']:.4f}")
print(f"  Recall:    {results_roberta['eval_recall']:.4f}")
print(f"  F1 Score:  {results_roberta['eval_f1']:.4f}")
print(f"  Loss:      {results_roberta['eval_loss']:.4f}")
print("=" * 80)

# Write a short plain-text summary of the scores next to the checkpoints
report_lines_roberta = [
    "Est-RoBERTa NER Results\n",
    "=" * 50 + "\n",
    f"Precision: {results_roberta['eval_precision']:.4f}\n",
    f"Recall: {results_roberta['eval_recall']:.4f}\n",
    f"F1 Score: {results_roberta['eval_f1']:.4f}\n",
    f"Loss: {results_roberta['eval_loss']:.4f}\n",
]
os.makedirs(OUTPUT_DIR_ROBERTA, exist_ok=True)
with open(f"{OUTPUT_DIR_ROBERTA}/results.txt", 'w') as f:
    f.writelines(report_lines_roberta)

print(f"\n Est-RoBERTa results saved to {OUTPUT_DIR_ROBERTA}/results.txt")

## Wikibert: tests and final training.

In [None]:
from transformers import BertTokenizer, BertModel, BertForTokenClassification

In [None]:
# NOTE: rebinds the OUTPUT_DIR and tokenizer globals that the EstBERT section defined earlier
OUTPUT_DIR = '...'  # folder that receives this model's checkpoints and result file
tokenizer = BertTokenizer.from_pretrained('...')

In [None]:
def tokenize_and_align(examples):
    """Tokenize word by word and align NER labels, padding every example to MAX_LEN.

    Uses the module-level ``tokenizer`` and ``label2id``. Each sequence is
    built by hand as ``[CLS] + subwords + [SEP]``. Sequences that are too
    long are cut BEFORE ``[SEP]`` is appended, so a truncated sequence still
    ends with ``[SEP]`` instead of losing it.

    Args:
        examples: Batch with ``'tokens'`` (one word list per sentence) and
            ``'ner_tags'`` (one tag list per sentence).

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list of length-MAX_LEN lists. The first subword of a word
        carries the word's label id; special tokens, later subwords and
        padding carry -100.
    """
    # Maximum sequence length for the model
    MAX_LEN = 512

    # Lists to store processed inputs for the whole batch
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for words, tags in zip(examples["tokens"], examples["ner_tags"]):

        # Start with [CLS]; -100 means "ignore this position in the loss"
        input_ids = [tokenizer.cls_token_id]
        labels = [-100]

        for word, tag in zip(words, tags):
            word_tokens = tokenizer.tokenize(word)

            # Words that produce no subwords are dropped together with their tag
            if not word_tokens:
                continue

            input_ids.extend(tokenizer.convert_tokens_to_ids(word_tokens))

            # Label only the first subword, ignore the rest
            labels.append(label2id[tag])
            labels.extend([-100] * (len(word_tokens) - 1))

        # Leave room for the closing [SEP] so truncation never removes it
        if len(input_ids) > MAX_LEN - 1:
            input_ids = input_ids[:MAX_LEN - 1]
            labels = labels[:MAX_LEN - 1]

        # Add [SEP] token at the end
        input_ids.append(tokenizer.sep_token_id)
        labels.append(-100)

        # Attention mask: 1 for real tokens, 0 for padding
        attention_mask = [1] * len(input_ids)

        # Pad input_ids, attention_mask, and labels to MAX_LEN
        pad_len = MAX_LEN - len(input_ids)
        if pad_len > 0:
            input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
            attention_mask = attention_mask + [0] * pad_len
            labels = labels + [-100] * pad_len

        all_input_ids.append(input_ids)
        all_attention_masks.append(attention_mask)
        all_labels.append(labels)

    # Return a dictionary compatible with Hugging Face datasets
    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels
    }


# Create Hugging Face datasets from raw data
# Wrap the raw sentence dicts in Hugging Face datasets
raw_train = Dataset.from_list(train_data)
raw_eval = Dataset.from_list(dev_data)

# Tokenize and align labels; the original text columns are dropped afterwards
train_dataset = raw_train.map(
    tokenize_and_align,
    batched=True,
    remove_columns=raw_train.column_names
)
eval_dataset = raw_eval.map(
    tokenize_and_align,
    batched=True,
    remove_columns=raw_eval.column_names
)


In [None]:
# Collator that turns lists of tokenized examples into batches for the Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)


def arguments(learning_r, batch):
    """Build the TrainingArguments for one short (3-epoch) grid-search run.

    Args:
        learning_r: Learning rate for the run.
        batch: Per-device batch size, used for both training and evaluation.

    Returns:
        A TrainingArguments object whose output directory is named after
        the learning rate and batch size.
    """
    settings = {
        "output_dir": f"{OUTPUT_DIR}/lr{learning_r}_bs{batch}",
        "learning_rate": learning_r,
        "per_device_train_batch_size": batch,
        "per_device_eval_batch_size": batch,
        "num_train_epochs": 3,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "logging_steps": 100,
        "save_total_limit": 2,
        "push_to_hub": False,
        "report_to": "none",
    }
    return TrainingArguments(**settings)

print("Training arguments configured")

Tests:

In [None]:
# Hyperparameter grid: every learning rate is tried with every batch size
learning_rates = [1e-5, 5e-6, 1e-6, 5e-5]
batch_sizes = [16, 24, 32]
results_list = []

for l in learning_rates:
    for b in batch_sizes:
        # Each run starts from a freshly loaded pretrained model
        trainer = Trainer(
            model=BertForTokenClassification.from_pretrained(
                '...',
                num_labels=len(label2id),
                id2label=id2label,
                label2id=label2id
            ),
            args=arguments(l, b),
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        print("Trainer created")
        print("\n" + "="*80)
        # Banner names the model of this section (it used to say EstBERT)
        print("STARTING TRAINING - WikiBERT")
        print("="*80)
        print("Model: TurkuNLP/wikibert-base-et-cased")
        print(f"Training samples: {len(train_data)}")
        print(f"Dev samples: {len(dev_data)}")
        print("Epochs: 3")
        print("Batchsize:", b)
        print("Learning rate:", l)
        print("="*80 + "\n")

        trainer.train()

        print("\n Training complete!")
        print("\n" + "="*80)
        print("EVALUATION")
        print("="*80)

        # Scores of this run on the development set
        results = trainer.evaluate()

        print("\nResults:")
        print(f"  Precision: {results['eval_precision']:.4f}")
        print(f"  Recall:    {results['eval_recall']:.4f}")
        print(f"  F1 Score:  {results['eval_f1']:.4f}")
        print(f"  Loss:      {results['eval_loss']:.4f}")
        print("="*80)

        # One row per configuration, ranked later
        results_list.append({
            "learning_rate": l,
            "batch_size": b,
            "precision": results["eval_precision"],
            "recall": results["eval_recall"],
            "f1": results["eval_f1"],
            "loss": results["eval_loss"]
        })

Looking for the best parameters:

In [None]:
# Rank the WikiBERT grid-search runs by their "f1" column, best first (the table is the cell's output)
pd.DataFrame(results_list).sort_values("f1", ascending=False)

Training with the best parameters:

In [None]:
# Final WikiBERT run: 50 epochs is only an upper bound, early stopping ends the run sooner
training_args = TrainingArguments(
    output_dir=f"{OUTPUT_DIR}/best",
    learning_rate=5e-05,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=50,          # limit
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=100,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
)

trainer = Trainer(
    model=BertForTokenClassification.from_pretrained(
        "TurkuNLP/wikibert-base-et-cased",
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    ),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop once the monitored metric has not improved for 4 consecutive evaluations
        EarlyStoppingCallback(
            early_stopping_patience=4,
            early_stopping_threshold=0.0
        )
    ],
)

print("Trainer created")
print("\n" + "="*80)
# Banner names the model of this section (it used to say EstBERT)
print("STARTING TRAINING - WikiBERT")
print("="*80)
print("Model: wikibert")
print(f"Training samples: {len(train_data)}")
print(f"Dev samples: {len(dev_data)}")
print("Epochs: ?")
print("Batchsize: 24 ")
print("Learning rate: 5e-05")
print("="*80 + "\n")

trainer.train()
print("\nTraining complete!")

In [None]:
print("\n" + "="*80)
print("EVALUATION")
print("="*80)

# Score the trained model on the Trainer's evaluation dataset
results = trainer.evaluate()

print("\nResults:")
print(f"  Precision: {results['eval_precision']:.4f}")
print(f"  Recall:    {results['eval_recall']:.4f}")
print(f"  F1 Score:  {results['eval_f1']:.4f}")
print(f"  Loss:      {results['eval_loss']:.4f}")
print("="*80)

# Save results; the header names WikiBERT so the file is not mistaken
# for the EstBERT report written earlier in the notebook
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(f"{OUTPUT_DIR}/results.txt", 'w') as f:
    f.write("WikiBERT NER Results\n")
    f.write("="*50 + "\n")
    f.write(f"Precision: {results['eval_precision']:.4f}\n")
    f.write(f"Recall: {results['eval_recall']:.4f}\n")
    f.write(f"F1 Score: {results['eval_f1']:.4f}\n")
    f.write(f"Loss: {results['eval_loss']:.4f}\n")

print(f"\n Results saved to {OUTPUT_DIR}/results.txt")
print(f"Model saved to {OUTPUT_DIR}/")

## Evaluation of models trained on a set of non-normalised data.

In [None]:
from tqdm import tqdm

### Estbert

In [None]:
model_path = "..."  # path of the fine-tuned checkpoint to evaluate
# Tokenizer and model are both loaded from the same checkpoint folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
model.eval()  # switch the model to evaluation mode

In [None]:
# Read the test split with the same CoNLL reader used for train/dev
test_data = load_data('...')


def tokenize_and_align(examples):
    """Tokenize pre-split words (padded, at most 512 tokens) and align NER labels.

    Uses the module-level ``tokenizer`` and ``label2id``; whichever tokenizer
    is bound to that global at call time is the one applied.

    Args:
        examples: Batch with ``'tokens'`` and ``'ner_tags'`` lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: the first
        subword of each word gets the word's label id, all other positions
        get -100.
    """
    encoding = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=512
    )

    def align(word_ids, word_tags):
        """Return one label id (or -100) per position of a single sentence."""
        aligned = []
        previous = None
        for current in word_ids:
            if current is None or current == previous:
                aligned.append(-100)
            else:
                aligned.append(label2id[word_tags[current]])
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples['ner_tags'])
    ]
    return encoding

In [None]:
def compute_metrics(p):
    """
    Compute overall and per-entity-type metrics using nervaluate.

    Uses the module-level ``id2label``, ``entity_types`` and
    ``convert_to_entities``.

    Args:
        p: Pair ``(predictions, labels)``; predictions are reduced with
            argmax over axis 2 and positions labelled -100 are skipped.

    Returns:
        Dict with strict and partial precision/recall/F1, plus
        ``<type>_precision``, ``<type>_recall`` and ``<type>_f1`` (strict)
        for every entry of ``entity_types``; types missing from the
        evaluator output are reported as 0.0.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Map ids back to BIO tags, keeping only positions with a real label
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[gold_id] for _, gold_id in kept])

    # Convert BIO tags to entity format for nervaluate
    true_entities = [convert_to_entities(tags) for tags in true_labels]
    pred_entities = [convert_to_entities(tags) for tags in true_predictions]

    results = Evaluator(true_entities, pred_entities, tags=entity_types).evaluate()
    overall = results['overall']
    entities = results['entities']

    strict = overall['strict']
    partial = overall['partial']

    metrics = {
        'precision': strict.precision,
        'recall': strict.recall,
        'f1': strict.f1,
        'partial_precision': partial.precision,
        'partial_recall': partial.recall,
        'partial_f1': partial.f1,
    }

    # Per-entity-type strict scores
    for ent_type in entity_types:
        if ent_type in entities:
            per_type = entities[ent_type]['strict']
            scores = (per_type.precision, per_type.recall, per_type.f1)
        else:
            scores = (0.0, 0.0, 0.0)
        for suffix, value in zip(('precision', 'recall', 'f1'), scores):
            metrics[f'{ent_type}_{suffix}'] = value

    return metrics


print(" Metrics function ready (using nervaluate)")
print("  Evaluation scheme: strict (exact boundary + correct type)")
print("  Aggregation: micro-average (default in nervaluate)")

In [None]:
# Set the WANDB_DISABLED environment variable before the Trainer is created
os.environ["WANDB_DISABLED"] = "true"

print("Preparing test dataset...")
raw_test = Dataset.from_list(test_data)
test_dataset = raw_test.map(
    tokenize_and_align,
    batched=True,
    remove_columns=raw_test.column_names
)
print(" Test dataset ready")

# Data collator for token classification; -100 is the standard ignore value for label padding
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100
)

# Trainer is used here only as an evaluation harness (no training arguments are passed)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n" + "=" * 70)
print("EVALUATION ON THE TEST SET")
print("=" * 70)

test_results = trainer.evaluate(test_dataset)

print("\n" + "=" * 70)
print("RESULTS ON THE TEST SET")
print("=" * 70)

# Strict and partial scores share one layout; only the metric key prefix differs
for heading, key_prefix in (
    ("\nStrict matching (exact boundary + correct type):", "eval_"),
    ("\nPartial matching:", "eval_partial_"),
):
    print(heading)
    print(f"  Precision: {test_results[key_prefix + 'precision']:.4f}")
    print(f"  Recall:    {test_results[key_prefix + 'recall']:.4f}")
    print(f"  F1 Score:  {test_results[key_prefix + 'f1']:.4f}")

print(f"\nLoss: {test_results['eval_loss']:.4f}")
print("=" * 70)

In [None]:
# Headline F1 scores from the most recent trainer.evaluate() call
print(
    "\n=== Overall ===",
    f"Strict F1: {test_results['eval_f1']:.4f}",
    f"Partial F1: {test_results['eval_partial_f1']:.4f}",
    sep="\n",
)

# One row per entity type; metrics absent from test_results are shown as 0
print("\n=== Per-entity results (strict) ===")
for ent in entity_types:
    print(
        f"{ent:10s}  P={test_results.get(f'eval_{ent}_precision', 0):.4f}  "
        f"R={test_results.get(f'eval_{ent}_recall', 0):.4f}  "
        f"F1={test_results.get(f'eval_{ent}_f1', 0):.4f}"
    )

### Est-roberta

In [None]:
# Load the tokenizer and token-classification model for this section from one
# checkpoint location. NOTE(review): '...' looks like a placeholder path —
# set it to the real checkpoint directory before running.
model_path = '...'  # checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
model.eval()  # switch the model to evaluation mode

In [None]:
# Turn off Weights & Biases reporting before the Trainer is created
os.environ["WANDB_DISABLED"] = "true"

# Tokenize the test split with the currently loaded tokenizer and keep only
# the columns produced by tokenize_and_align
print("Preparing test dataset...")
test_dataset = Dataset.from_list(test_data)
test_dataset = test_dataset.map(tokenize_and_align, batched=True, remove_columns=test_dataset.column_names)
print("Test dataset ready")

# Batch collator for token classification; padded label positions get -100
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Evaluation-only Trainer (no TrainingArguments are supplied)
trainer = Trainer(model=model, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics)

print("\n" + "=" * 70, "START EVALUATION ON THE TEST SET", "=" * 70, sep="\n")
test_results = trainer.evaluate(test_dataset)

# Report overall strict / partial scores and the evaluation loss
print("\n" + "=" * 70, "RESULTS ON THE TEST SET", "=" * 70, sep="\n")
print(
    "\nStrict matching (exact boundary + correct type):",
    f"  Precision: {test_results['eval_precision']:.4f}",
    f"  Recall:    {test_results['eval_recall']:.4f}",
    f"  F1 Score:  {test_results['eval_f1']:.4f}",
    sep="\n",
)
print(
    "\nPartial matching:",
    f"  Precision: {test_results['eval_partial_precision']:.4f}",
    f"  Recall:    {test_results['eval_partial_recall']:.4f}",
    f"  F1 Score:  {test_results['eval_partial_f1']:.4f}",
    sep="\n",
)
print(f"\nLoss: {test_results['eval_loss']:.4f}", "=" * 70, sep="\n")

In [None]:
# Headline F1 scores from the most recent trainer.evaluate() call
print(
    "\n=== Overall ===",
    f"Strict F1: {test_results['eval_f1']:.4f}",
    f"Partial F1: {test_results['eval_partial_f1']:.4f}",
    sep="\n",
)

# Strict precision / recall / F1 for every entity type; a metric that is
# missing from test_results is printed as 0
print("\n=== Per-entity results (strict) ===")
for ent in entity_types:
    print(
        f"{ent:10s}  P={test_results.get(f'eval_{ent}_precision', 0):.4f}  "
        f"R={test_results.get(f'eval_{ent}_recall', 0):.4f}  "
        f"F1={test_results.get(f'eval_{ent}_f1', 0):.4f}"
    )

### Wikibert

In [None]:
# The BERT-specific classes are not part of the notebook's initial
# `from transformers import (...)` cell, so import them here to keep this
# cell runnable without relying on an earlier section having done so.
from transformers import BertForTokenClassification, BertTokenizer

# Load the tokenizer and token-classification model for this section from one
# checkpoint location. NOTE(review): '...' looks like a placeholder path —
# set it to the real checkpoint directory before running.
model_path = '...'  # checkpoint
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)
model.eval()  # switch the model to evaluation mode

In [None]:
def tokenize_and_align(examples):
    """Tokenize pre-split sentences word by word and align their NER labels.

    Every word is split into subword tokens with the module-level
    ``tokenizer``. The word's tag (mapped through the module-level
    ``label2id``) is attached to the first subword only; the remaining
    subwords, the [CLS]/[SEP] tokens and all padding positions receive
    -100 so they are ignored in the loss and in the metrics.

    Sequences are truncated so that [CLS] + subwords + [SEP] fits in
    ``MAX_LEN`` positions, and are then padded to exactly ``MAX_LEN``.

    Args:
        examples: Batch dictionary with "tokens" (list of word lists) and
            "ner_tags" (list of tag lists, parallel to "tokens").

    Returns:
        Dictionary with "input_ids", "attention_mask" and "labels", each a
        list holding one ``MAX_LEN``-long list per input sentence.
    """
    # Maximum sequence length for the model
    MAX_LEN = 512
    # Positions left for subword tokens once [CLS] and [SEP] are reserved
    max_pieces = MAX_LEN - 2

    # Lists to store processed inputs for the whole batch
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    # Iterate over tokenized sentences and their NER tags
    for words, tags in zip(examples["tokens"], examples["ner_tags"]):
        piece_ids = []
        piece_labels = []

        # Process each word and its corresponding NER tag
        for word, tag in zip(words, tags):
            word_tokens = tokenizer.tokenize(word)

            # Skip words that produce no tokens
            if not word_tokens:
                continue

            piece_ids.extend(tokenizer.convert_tokens_to_ids(word_tokens))

            # Assign the label only to the first subword token and ignore
            # the remaining subword tokens
            piece_labels.append(label2id[tag])
            piece_labels.extend([-100] * (len(word_tokens) - 1))

        # Truncate BEFORE adding [SEP]: cutting the finished sequence at
        # MAX_LEN would drop the closing [SEP] from over-long sentences.
        piece_ids = piece_ids[:max_pieces]
        piece_labels = piece_labels[:max_pieces]

        # Wrap with [CLS] ... [SEP]; special tokens are never scored
        input_ids = [tokenizer.cls_token_id] + piece_ids + [tokenizer.sep_token_id]
        labels = [-100] + piece_labels + [-100]

        # Attention mask: 1 for real tokens, 0 for padding
        attention_mask = [1] * len(input_ids)

        # Pad input_ids, attention_mask, and labels to MAX_LEN
        pad_len = MAX_LEN - len(input_ids)
        if pad_len > 0:
            input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
            attention_mask = attention_mask + [0] * pad_len
            labels = labels + [-100] * pad_len

        # Save processed example
        all_input_ids.append(input_ids)
        all_attention_masks.append(attention_mask)
        all_labels.append(labels)

    # Return a dictionary compatible with Hugging Face datasets
    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels
    }


# Training split: wrap the raw sentences in a Hugging Face Dataset, then
# replace the raw columns with the tokenized, label-aligned model inputs
train_dataset = Dataset.from_list(train_data)
train_dataset = train_dataset.map(tokenize_and_align, batched=True, remove_columns=train_dataset.column_names)

# Development split: same processing as the training split
eval_dataset = Dataset.from_list(dev_data)
eval_dataset = eval_dataset.map(tokenize_and_align, batched=True, remove_columns=eval_dataset.column_names)

In [None]:
# Turn off Weights & Biases reporting before the Trainer is created
os.environ["WANDB_DISABLED"] = "true"

# Tokenize the test split with this section's tokenize_and_align and keep
# only the columns it produces
print("Preparing test dataset...")
test_dataset = Dataset.from_list(test_data)
test_dataset = test_dataset.map(tokenize_and_align, batched=True, remove_columns=test_dataset.column_names)
print("Test dataset ready")

# Batch collator for token classification; padded label positions get -100
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Evaluation-only Trainer (no TrainingArguments are supplied)
trainer = Trainer(model=model, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics)

# Run the evaluation
print("\n" + "=" * 70, "EVALUATION", "=" * 70, sep="\n")
test_results = trainer.evaluate(test_dataset)

# Report overall strict / partial scores and the evaluation loss
print("\n" + "=" * 70, "RESULTS ON TEST SET", "=" * 70, sep="\n")
print(
    "\nStrict matching (exact boundary + correct type):",
    f"  Precision: {test_results['eval_precision']:.4f}",
    f"  Recall:    {test_results['eval_recall']:.4f}",
    f"  F1 Score:  {test_results['eval_f1']:.4f}",
    sep="\n",
)
print(
    "\nPartial matching:",
    f"  Precision: {test_results['eval_partial_precision']:.4f}",
    f"  Recall:    {test_results['eval_partial_recall']:.4f}",
    f"  F1 Score:  {test_results['eval_partial_f1']:.4f}",
    sep="\n",
)
print(f"\nLoss: {test_results['eval_loss']:.4f}", "=" * 70, sep="\n")

In [None]:
# Headline F1 scores from the most recent trainer.evaluate() call
print(
    "\n=== Overall ===",
    f"Strict F1: {test_results['eval_f1']:.4f}",
    f"Partial F1: {test_results['eval_partial_f1']:.4f}",
    sep="\n",
)

# Strict scores per entity type; metrics not present in test_results
# fall back to 0
print("\n=== Per-entity results (strict) ===")
for ent in entity_types:
    print(
        f"{ent:10s}  P={test_results.get(f'eval_{ent}_precision', 0):.4f}  "
        f"R={test_results.get(f'eval_{ent}_recall', 0):.4f}  "
        f"F1={test_results.get(f'eval_{ent}_f1', 0):.4f}"
    )

### EstBERT_NER

In [None]:
# Read the test split with the notebook's CoNLL-style TSV loader.
# NOTE(review): '...' looks like a placeholder — set the real file path.
test_data = load_data('...')

def tokenize_and_align(examples):
    """Tokenize a batch of pre-split sentences and align NER labels to subwords.

    The module-level ``tokenizer`` encodes the whole batch at once
    (truncated to 512 positions and padded within the batch). For each
    word, only its first subword receives the word's label id, looked up in
    ``model.config.label2id``; tags unknown to the model are mapped to "O".
    Special tokens, padding and non-initial subwords are labelled -100.

    Args:
        examples: Batch dictionary with "tokens" (list of word lists) and
            "ner_tags" (list of tag lists, parallel to "tokens").

    Returns:
        Plain dict with the tokenizer outputs plus a "labels" entry holding
        one list of label ids per sentence.
    """
    # Debug trace: shows the first few tags of every batch that is processed
    print("CALLED tokenize_and_align, sample tags:", examples["ner_tags"][0][:5])

    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding=True,
    )

    known_labels = model.config.label2id
    aligned = []

    for row, sentence_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous = None

        for word_idx in encoding.word_ids(batch_index=row):
            if word_idx is None or word_idx == previous:
                # Special/padding token, or a continuation subword
                row_labels.append(-100)
            else:
                # First subword of a new word carries the word's label
                tag = sentence_tags[word_idx]
                row_labels.append(known_labels[tag if tag in known_labels else "O"])
            previous = word_idx

        aligned.append(row_labels)

    output = dict(encoding)
    output["labels"] = aligned
    return output

In [None]:
def convert_to_entities(tags):
    """
    Convert BIO tags to entity list format required by nervaluate.

    An entity starts at a 'B-' tag, at an 'I-' tag with no open entity, or
    at an 'I-' tag whose type differs from the open entity (the open entity
    is closed first, so a type change never gets merged into one span).
    An entity is extended by 'I-' tags of the same type and closed by 'O',
    by the start of another entity, or by the end of the sentence. Tags
    that are neither 'O' nor prefixed with 'B-'/'I-' are ignored.

    Args:
        tags: List of BIO tags for one sentence

    Returns:
        List of entity dictionaries with 'label', 'start', 'end', where
        'start' is the index of the first tag of the entity and 'end' is
        one past the index of its last tag
    """
    entities = []
    current_entity = None

    for i, tag in enumerate(tags):
        if tag == 'O':
            if current_entity is not None:
                entities.append(current_entity)
                current_entity = None
            continue

        prefix, entity_type = tag[:2], tag[2:]
        if prefix not in ('B-', 'I-'):
            # Unrecognised tag: leave any open entity untouched
            continue

        continues_current = (
            prefix == 'I-'
            and current_entity is not None
            and current_entity['label'] == entity_type
        )
        if continues_current:
            current_entity['end'] = i + 1
            continue

        # 'B-' tag, orphan 'I-' tag, or 'I-' tag of a different type:
        # close whatever is open and start a new entity here
        if current_entity is not None:
            entities.append(current_entity)
        current_entity = {
            'label': entity_type,
            'start': i,
            'end': i + 1
        }

    if current_entity is not None:
        entities.append(current_entity)

    return entities
    
def compute_metrics(p):
    """Compute entity-level NER metrics with nervaluate for a Trainer evaluation.

    Label names and entity types are read from ``model.config.id2label`` of
    the module-level ``model``, so no external label list is needed.
    Positions whose gold label is -100 are excluded from both the gold and
    the predicted tag sequences.

    Args:
        p: Pair ``(logits, labels)``; the class is chosen with an argmax
            over axis 2 of ``logits``.

    Returns:
        Dictionary with overall strict "precision"/"recall"/"f1", overall
        "partial_precision"/"partial_recall"/"partial_f1", and strict
        "<TYPE>_precision"/"<TYPE>_recall"/"<TYPE>_f1" for every entity
        type (0.0 when the evaluator reports nothing for that type).
    """
    import numpy as np
    from nervaluate import Evaluator

    logits, labels = p
    preds = np.argmax(logits, axis=2)
    id2label = model.config.id2label

    # Entity types (PER/ORG/LOC...) are whatever follows the first "-" in a label
    entity_types = sorted(
        {name.split("-", 1)[1] for name in id2label.values() if "-" in name}
    )

    # Build gold and predicted tag sequences, keeping scored positions only
    true_labels, true_predictions = [], []
    for pred_row, lab_row in zip(preds, labels):
        gold_tags, pred_tags = [], []
        for pred_id, gold_id in zip(pred_row, lab_row):
            if gold_id == -100:
                continue
            gold_tags.append(id2label[int(gold_id)])
            pred_tags.append(id2label[int(pred_id)])
        true_labels.append(gold_tags)
        true_predictions.append(pred_tags)

    true_entities = [convert_to_entities(seq) for seq in true_labels]
    pred_entities = [convert_to_entities(seq) for seq in true_predictions]

    results = Evaluator(true_entities, pred_entities, tags=entity_types).evaluate()
    strict = results["overall"]["strict"]
    partial = results["overall"]["partial"]
    per_entity = results["entities"]

    metrics = {
        "precision": strict.precision,
        "recall": strict.recall,
        "f1": strict.f1,
        "partial_precision": partial.precision,
        "partial_recall": partial.recall,
        "partial_f1": partial.f1,
    }

    # Strict per-type scores; types missing from the results get zeros
    for ent_type in entity_types:
        scores = per_entity[ent_type]["strict"] if ent_type in per_entity else None
        for field in ("precision", "recall", "f1"):
            metrics[f"{ent_type}_{field}"] = 0.0 if scores is None else getattr(scores, field)

    return metrics



# Cell status banner: confirms the model-agnostic metrics function is defined
# and summarises the evaluation settings it reports.
print(
    " Universal metrics function ready (nervaluate-based)",
    "   Auto-detects labels from model",
    "   Evaluation scheme: strict + partial",
    "   Aggregation: micro-average (nervaluate default)",
    sep="\n",
)


In [None]:
# Load this section's model and its tokenizer from the same identifier.
# NOTE(review): '...' looks like a placeholder — set the real model name/path.
model_name = '...'
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Show the label inventory stored in the model config; this section's
# tokenize_and_align and compute_metrics read label2id / id2label from it.
print("num_labels:", model.config.num_labels)
print("label2id:", model.config.label2id)
print("id2label:", model.config.id2label)

In [None]:
# Turn off Weights & Biases reporting before the Trainer is created
os.environ["WANDB_DISABLED"] = "true"

# Tokenize the test split and keep only the columns produced by
# tokenize_and_align
print("Preparing test dataset...")
test_dataset = Dataset.from_list(test_data)
test_dataset = test_dataset.map(
    tokenize_and_align,
    batched=True,
    remove_columns=test_dataset.column_names,
)

print("Test dataset ready")

# Batch collator for token classification; padded label positions get -100
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

# Evaluation-only Trainer (no TrainingArguments are supplied)
trainer = Trainer(model=model, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics)

# Run the evaluation
print("\n" + "=" * 70, "EVALUATION", "=" * 70, sep="\n")
test_results = trainer.evaluate(test_dataset)

# Report overall strict / partial scores and the evaluation loss
print("\n" + "=" * 70, "RESULTS ON TEST SET", "=" * 70, sep="\n")
print(
    "\nStrict matching (exact boundary + correct type):",
    f"  Precision: {test_results['eval_precision']:.4f}",
    f"  Recall:    {test_results['eval_recall']:.4f}",
    f"  F1 Score:  {test_results['eval_f1']:.4f}",
    sep="\n",
)
print(
    "\nPartial matching:",
    f"  Precision: {test_results['eval_partial_precision']:.4f}",
    f"  Recall:    {test_results['eval_partial_recall']:.4f}",
    f"  F1 Score:  {test_results['eval_partial_f1']:.4f}",
    sep="\n",
)
print(f"\nLoss: {test_results['eval_loss']:.4f}", "=" * 70, sep="\n")

In [None]:
# Headline F1 scores from the most recent trainer.evaluate() call
print("\n=== Overall ===")
print(f"Strict F1: {test_results['eval_f1']:.4f}")
print(f"Partial F1: {test_results['eval_partial_f1']:.4f}")

# This section's compute_metrics names its per-entity metrics after the types
# found in model.config.id2label. Derive the same list here instead of reusing
# the notebook-level `entity_types` from earlier sections: with a different
# label set, rows would silently show 0.0000 or be left out altogether.
model_entity_types = sorted({
    lab.split("-", 1)[1] for lab in model.config.id2label.values() if "-" in lab
})

print("\n=== Per-entity results (strict) ===")
for ent in model_entity_types:
    print(f"{ent:10s}  P={test_results.get(f'eval_{ent}_precision', 0):.4f}  "
          f"R={test_results.get(f'eval_{ent}_recall', 0):.4f}  "
          f"F1={test_results.get(f'eval_{ent}_f1', 0):.4f}")