In [1]:
from transformers import AutoTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from transformers import BertTokenizerFast, BertForTokenClassification
from datasets import Dataset
import torch
from transformers import TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
#!pip install seqeval

In [3]:
import json
with open('Gemini.json' , 'r') as f:
    data = json.load(f)

In [4]:
def get_bio_labels(tokens, offsets, spans):
    labels = ["O"] * len(tokens)
    for start, end, label in spans:
        for i, (tok_start, tok_end) in enumerate(offsets):
            if tok_start >= start and tok_end <= end:
                if tok_start == start:
                    labels[i] = f"B-{label}"
                else:
                    labels[i] = f"I-{label}"
    return labels


In [5]:
Labels = ['O', 'B-Ingredients', 'I-Ingredients', 'B-Number', 'I-Number', 'B-Type', 'I-Type', 'B-Units', 'I-Units']
label2id = {'O': 0, 'B-Ingredients': 1, 'I-Ingredients': 2, 'B-Number': 3, 'I-Number': 4, 'B-Type': 5, 'I-Type': 6, 'B-Units': 7, 'I-Units': 8}
id2label = {0: 'O', 1: 'B-Ingredients', 2: 'I-Ingredients', 3: 'B-Number', 4: 'I-Number', 5: 'B-Type', 6: 'I-Type', 7: 'B-Units', 8: 'I-Units'}

def calc_tokens(text, entities):
    spans = []
    try:
      for ent_text, ent_label in entities:
          start = text.find(ent_text)
          end = start + len(ent_text)
          spans.append((start, end, ent_label))

      encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
      tokens = encoding.tokens()
      offsets = encoding.offset_mapping

      labels = get_bio_labels(tokens, offsets, spans)
      entity = [value[0] for value in entities]
      label = [label2id[value] for value in labels]
      return {"tokens":tokens, "ner_tags":label}
    except Exception as e:
      return None

In [6]:
dataset = []
for example in data:
    text = list(example.keys())[0]
    values = list(example.values())
    entities = values[0]['entities']
    dataset.append(calc_tokens(text, entities))

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=32
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [10]:
hf_dataset = Dataset.from_list([d for d in dataset if d is not None])

In [11]:
tokenized_dataset = hf_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=hf_dataset.column_names
)

Map:   0%|          | 0/9204 [00:00<?, ? examples/s]

In [12]:
from datasets import DatasetDict
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"]
})

In [13]:
model_name = "bert-base-german-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

label2id = {'O': 0, 'B-Ingredients': 1, 'I-Ingredients': 2, 'B-Number': 3, 'I-Number': 4, 'B-Type': 5, 'I-Type': 6, 'B-Units': 7, 'I-Units': 8}
id2label = {v: k for k, v in label2id.items()}

model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    for pred, label in zip(predictions, labels):
        true_label = []
        true_pred = []
        for p_, l_ in zip(pred, label):
            if l_ != -100:
                true_label.append(id2label[l_])
                true_pred.append(id2label[p_])
        true_labels.append(true_label)
        true_predictions.append(true_pred)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }

In [21]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1078,0.216867,0.920489,0.933248,0.926825
2,0.1295,0.229818,0.919914,0.934343,0.927072
3,0.0592,0.260711,0.918097,0.936349,0.927133
4,0.0315,0.275636,0.92178,0.937078,0.929366


TrainOutput(global_step=1844, training_loss=0.09491186304555281, metrics={'train_runtime': 273.3597, 'train_samples_per_second': 107.741, 'train_steps_per_second': 6.746, 'total_flos': 481012444664064.0, 'train_loss': 0.09491186304555281, 'epoch': 4.0})

In [33]:
def test_model(text, model, tokenizer, id2label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    inputs = {k: v.to(device) for k, v in encoding.items() if k != "offset_mapping"}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
    predicted_labels = [id2label[p.item()] for p in predictions[0]]

    reconstructed = reconstruct_tokens(tokens, predicted_labels)
    print(f"\n{'Wort':<20} | Label")
    print("-" * 35)
    for word, label in reconstructed:
        print(f"{word:<20} | {label}")


test_model("3 Eier", model, tokenizer, id2label)



Wort                 | Label
-----------------------------------
3                    | O
Eier                 | B-Units


In [29]:
def reconstruct_tokens(tokens, labels):
    words = []
    word_labels = []

    current_word = ""
    current_label = ""

    for token, label in zip(tokens, labels):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue

        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
                word_labels.append(current_label)
            current_word = token
            current_label = label

    if current_word:
        words.append(current_word)
        word_labels.append(current_label)

    return list(zip(words, word_labels))


In [34]:
model.save("model")

AttributeError: 'BertForTokenClassification' object has no attribute 'save'