## Modern Bert Finetuning

@Noah Meissner 3.06.2025

In [None]:
%pip install seqeval
%pip install datasets
%pip install transformers
%pip install torch

In [None]:
from transformers import AutoTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from transformers import BertTokenizerFast, BertForTokenClassification
from datasets import Dataset
import json
from prompts import ingredients_extraction

import torch
import pandas as pd
from data_structure.DataType import DataType
from data_structure.model_name import ModelName
import os
from request.request_gemini import request
from annotate_ingredients import label_data
from tqdm import tqdm
from load_ner import ner_loader, ner_safer
from DataLoader_Ingredients import DataLoader
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


### Data Labelling
the evaluation of the NER Classification showed that Gemini 2.0 Flash hat the best performance. For that reason we use this model to label 50000 unique ingredients to finetune Modern Bert

In [None]:
data = label_data()

## Finetune Model

In [None]:
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-large")

In [None]:
def get_bio_labels(tokens, offsets, spans):
    labels = ["O"] * len(tokens)
    for start, end, label in spans:
        for i, (tok_start, tok_end) in enumerate(offsets):
            if tok_start >= start and tok_end <= end:
                if tok_start == start:
                    labels[i] = f"B-{label}"
                else:
                    labels[i] = f"I-{label}"
    return labels

In [6]:
Labels = ['O', 'B-Ingredients', 'I-Ingredients', 'B-Number', 'I-Number', 'B-Type', 'I-Type', 'B-Units', 'I-Units']
label2id = {'O': 0, 'B-Ingredients': 1, 'I-Ingredients': 2, 'B-Number': 3, 'I-Number': 4, 'B-Type': 5, 'I-Type': 6, 'B-Units': 7, 'I-Units': 8}
id2label = {0: 'O', 1: 'B-Ingredients', 2: 'I-Ingredients', 3: 'B-Number', 4: 'I-Number', 5: 'B-Type', 6: 'I-Type', 7: 'B-Units', 8: 'I-Units'}

def calc_tokens(text, entities):
    spans = []
    try:
      for ent_text, ent_label in entities:
          start = text.find(ent_text)
          end = start + len(ent_text)
          spans.append((start, end, ent_label))

      encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
      tokens = encoding.tokens()
      offsets = encoding.offset_mapping

      labels = get_bio_labels(tokens, offsets, spans)
      entity = [value[0] for value in entities]
      label = [label2id[value] for value in labels]
      return {"tokens":tokens, "ner_tags":label}
    except Exception as e:
      return None

In [6]:
dataset = []
for example in data:
    text = list(example.keys())[0]
    values = list(example.values())
    entities = values[0]['entities']
    dataset.append(calc_tokens(text, entities))

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=32
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [8]:
hf_dataset = Dataset.from_list([d for d in dataset if d is not None])

In [9]:
tokenized_dataset = hf_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=hf_dataset.column_names
)

Map:   0%|          | 0/59604 [00:00<?, ? examples/s]

In [10]:
from datasets import DatasetDict
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"]
})

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

label2id = {'O': 0, 'B-Ingredients': 1, 'I-Ingredients': 2, 'B-Number': 3, 'I-Number': 4, 'B-Type': 5, 'I-Type': 6, 'B-Units': 7, 'I-Units': 8}
id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Some weights of ModernBertForTokenClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    for pred, label in zip(predictions, labels):
        true_label = []
        true_pred = []
        for p_, l_ in zip(pred, label):
            if l_ != -100:
                true_label.append(id2label[l_])
                true_pred.append(id2label[p_])
        true_labels.append(true_label)
        true_predictions.append(true_pred)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Learning rate scheduling
    learning_rate=2e-5,  # Slightly lower
    warmup_ratio=0.1,    # Warmup as ratio of total steps
    lr_scheduler_type="cosine_with_restarts",

    # Batch size and gradient accumulation
    per_device_train_batch_size=16,  # Increase if memory allows
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,   # Effective batch size = 32

    # Training duration
    num_train_epochs=10,  # More epochs with early stopping
    max_steps=-1,

    # Regularization
    weight_decay=0.01,    # Lower weight decay
    #dropout=0.1,

    # Optimization
    adam_epsilon=1e-8,
    max_grad_norm=1.0,

    # Logging and evaluation
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=50,
    report_to="none"
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [15]:
trainer.train()

W0528 19:51:51.738000 435 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1379,0.178019,0.896374,0.918729,0.907414
2,0.1664,0.149683,0.917246,0.931761,0.924446
3,0.134,0.150984,0.913874,0.930477,0.922101


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1379,0.178019,0.896374,0.918729,0.907414
2,0.1664,0.149683,0.917246,0.931761,0.924446
3,0.134,0.150984,0.913874,0.930477,0.922101
4,0.1595,0.178311,0.919063,0.93343,0.926191
5,0.063,0.206853,0.917676,0.93099,0.924285
6,0.0381,0.279623,0.913745,0.932017,0.922791
7,0.0388,0.338078,0.914499,0.930027,0.922198
8,0.0115,0.421547,0.911951,0.929514,0.920649
9,0.0053,0.455296,0.913371,0.930316,0.921766
10,0.0012,0.461152,0.91312,0.929738,0.921354


TrainOutput(global_step=14910, training_loss=0.08943579568343998, metrics={'train_runtime': 13817.5987, 'train_samples_per_second': 34.509, 'train_steps_per_second': 1.079, 'total_flos': 3.151780892656704e+16, 'train_loss': 0.08943579568343998, 'epoch': 10.0})

In [18]:
torch.save(model.state_dict(), 'model_weights.pt')


In [8]:
def predict_and_display(text: str, model, tokenizer, id2label: dict) -> None:
    """Führt Vorhersagen durch und zeigt Token-Label-Paare an (CPU-only)."""
    # Modell sicher auf CPU setzen
    model = model.to('cpu')
    model.eval()

    # Tokenisierung mit Offsets
    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128  # Für Konsistenz
    )

    # Inference auf CPU
    with torch.no_grad():
        outputs = model(**{k: v.to('cpu') for k, v in encoding.items() if k != "offset_mapping"})
    
    # Vorhersagen verarbeiten
    predictions = torch.argmax(outputs.logits, dim=2).squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
    
    # Token-Label-Paare rekonstruieren
    reconstructed = []
    current_word = ""
    current_label = ""
    
    for token, label_id in zip(tokens, predictions):
        label = id2label.get(label_id, 'O')  # Fallback auf 'O'
        
        # Spezialtokens überspringen
        if token in [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]:
            continue
            
        # Subword-Handling (z.B. ##)
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                reconstructed.append((current_word, current_label))
            current_word = token
            current_label = label
    
    # Letztes Wort hinzufügen
    if current_word:
        reconstructed.append((current_word, current_label))

    # Ausgabe formatieren
    print(f"\n{'Token':<20} | {'Label'}")
    print("-" * 35)
    for word, label in reconstructed:
        print(f"{word:<20} | {label}")

# Beispielaufruf
predict_and_display(
    " 300ml Zucker",
    model,
    tokenizer,
    id2label
)



Token                | Label
-----------------------------------
Ġ300                 | B-Number
ml                   | B-Units
ĠZ                   | I-Ingredients
ucker                | I-Ingredients
