In [24]:
train_path = 'deu.train.utf.txt'
dev_path = 'deu.testa.utf.txt'
test_path = 'deu.testb.utf.txt'
model_checkpoint = "bert-base-german-cased"

epoch = 3
batch_size = 16
learning_rate = 5e-5
weight_decay = 0.0

In [5]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Using cached datasets-2.8.0-py3-none-any.whl (452 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py38-none-any.whl (132 kB)
Collecting xxhash
  Using cached xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3<1.27,>=1.21.1
  Using cached urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
Installing collected packages: xxhash, urllib3, multiprocess, responses, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed datasets-2.8.0 multiprocess-0.70.14 responses-0.18.0 urllib3-1.26.13 xxhash-3.2.0


In [6]:
import re
import pandas as pd
from datasets import Dataset

Load dataset

In [7]:
def load_data(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == '\n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split(' ',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [8]:
traindata = load_data(train_path)
test = load_data(test_path)
dev = load_data(dev_path)

traindata.head()

Unnamed: 0,tokens,ner_tags
0,[-DOCSTART-],[O]
1,"[Ereignis, und, Erzählung, oder, :]","[O, O, O, O, O]"
2,"[Schwierigkeiten, beim, nachvollziehenden, Ver...","[O, O, O, O, I-PER, I-PER, O, O, O, O, O, I-LO..."
3,"[Diskussionen, über, Asylbewerber-, ,, Aussied...","[O, O, O, O, O, O, O, O, O, I-LOC, O, O, O, O,..."
4,"[Es, ist, deswegen, momentan, nicht, schwer, ,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [9]:
tags_train = [x for x in traindata['ner_tags']]
tags_dev = [x for x in dev['ner_tags']]
tags_test = [x for x in test['ner_tags']]
tags = tags_train + tags_dev + tags_test
label_list = list(set([item for sublist in tags for item in sublist]))
label_list

['I-LOC', 'I-MISC', 'B-LOC', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'O', 'I-ORG']

In [10]:
label2id = {}
id2label= {}
count = 0
for i in label_list:
  label2id.update({i:count})
  id2label.update({count:i})
  count += 1
label2id

{'I-LOC': 0,
 'I-MISC': 1,
 'B-LOC': 2,
 'B-ORG': 3,
 'B-MISC': 4,
 'B-PER': 5,
 'I-PER': 6,
 'O': 7,
 'I-ORG': 8}

In [25]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer
)

In [26]:
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label = id2label,
    label2id = label2id
)


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "I-LOC",
    "1": "I-MISC",
    "2": "B-LOC",
    "3": "B-ORG",
    "4": "B-MISC",
    "5": "B-PER",
    "6": "I-PER",
    "7": "O",
    "8": "I-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 2,
    "B-MISC": 4,
    "B-ORG": 3,
    "B-PER": 5,
    "I-LOC": 0,
    "I-MISC": 1,
    "I-ORG": 8,
    "I-PER": 6,
    "O": 7
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}



Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/485k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu"

Downloading:   0%|          | 0.00/439M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/pytorch_model.bin
Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing 

In [28]:
def align_labels(data, label_encoding_dict):
    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    label_id_temp = {}
    for i, label in enumerate(data['ner_tags']):
        label_id_temp.update({i:label})

    word_ids = tokenized_inputs.word_ids(batch_index=0)

    labels = []
    for w_id in word_ids:
        if w_id == None:
            labels.append(-100)
        else:
            tag = label_id_temp[w_id]
            labels.append(label2id[tag])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

def tokenized_for_bert(df):
  for_bert = df.copy()
  for_bert['input_ids'] = ""
  for_bert['token_type_ids'] = ""
  for_bert['attention_mask']= ""
  for_bert['labels'] = ""

  for index, row in df.iterrows():
      inputs = align_labels(row,label2id)
      for_bert.at[index,'input_ids'] = inputs['input_ids']
      for_bert.at[index,'token_type_ids'] = inputs['token_type_ids']
      for_bert.at[index,'attention_mask'] = inputs['attention_mask']
      for_bert.at[index,'labels'] = inputs['labels']
  for_bert = for_bert.drop('tokens', axis = 1)
  for_bert = for_bert.drop('ner_tags', axis = 1)
  dataset_for_bert = Dataset.from_pandas(for_bert)
  print(dataset_for_bert)
  return dataset_for_bert

In [29]:
train_dataset = tokenized_for_bert(traindata)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 12705
})


In [30]:
dev_dataset = tokenized_for_bert(dev)
test_dataset = tokenized_for_bert(test)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3068
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3160
})


In [31]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from seqeval.metrics import classification_report

In [33]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    result = classification_report(true_labels, true_predictions, output_dict = True)

    print(result)
    
    return result['ORG']

In [35]:

args = TrainingArguments(
    f"bert-ner",
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay = weight_decay,
    metric_for_best_model = 'f1-score',
    save_total_limit=5,
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 12705
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2385
  Number of trainable parameters = 108497673


Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
1,0.1194,0.130302,0.687326,0.830808,0.752287,1188
2,0.0433,0.121733,0.811354,0.781987,0.796399,1188
3,0.0149,0.153334,0.829437,0.806397,0.817755,1188


***** Running Evaluation *****
  Num examples = 3068
  Batch size = 16


{'LOC': {'precision': 0.7560975609756098, 'recall': 0.835016835016835, 'f1-score': 0.7936, 'support': 1188}, 'MISC': {'precision': 0.7380224260958206, 'recall': 0.6836638338054769, 'f1-score': 0.7098039215686274, 'support': 1059}, 'ORG': {'precision': 0.6873259052924791, 'recall': 0.8308080808080808, 'f1-score': 0.7522865853658537, 'support': 1188}, 'PER': {'precision': 0.9081163859111792, 'recall': 0.8459343794579173, 'f1-score': 0.8759231905465289, 'support': 1402}, 'micro avg': {'precision': 0.7723932472691162, 'recall': 0.8040107504651643, 'f1-score': 0.7878849270664505, 'support': 4837}, 'macro avg': {'precision': 0.7723905695687722, 'recall': 0.7988557822720775, 'f1-score': 0.7829034243702525, 'support': 4837}, 'weighted avg': {'precision': 0.77931197027278, 'recall': 0.8040107504651643, 'f1-score': 0.7889683542489238, 'support': 4837}}


Saving model checkpoint to bert-ner/checkpoint-795
Configuration saved in bert-ner/checkpoint-795/config.json
Model weights saved in bert-ner/checkpoint-795/pytorch_model.bin
tokenizer config file saved in bert-ner/checkpoint-795/tokenizer_config.json
Special tokens file saved in bert-ner/checkpoint-795/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 16


{'LOC': {'precision': 0.8063241106719368, 'recall': 0.8585858585858586, 'f1-score': 0.8316347329800245, 'support': 1188}, 'MISC': {'precision': 0.754, 'recall': 0.7119924457034938, 'f1-score': 0.732394366197183, 'support': 1059}, 'ORG': {'precision': 0.8113537117903931, 'recall': 0.781986531986532, 'f1-score': 0.7963994856408059, 'support': 1188}, 'PER': {'precision': 0.9054151624548736, 'recall': 0.8944365192582026, 'f1-score': 0.89989235737352, 'support': 1402}, 'micro avg': {'precision': 0.8252346193952034, 'recall': 0.8180690510647095, 'f1-score': 0.8216362126245846, 'support': 4837}, 'macro avg': {'precision': 0.8192732462293009, 'recall': 0.8117503388835217, 'f1-score': 0.8150802355478833, 'support': 4837}, 'weighted avg': {'precision': 0.8248251624657805, 'recall': 0.8180690510647095, 'f1-score': 0.8210377032379654, 'support': 4837}}


Saving model checkpoint to bert-ner/checkpoint-1590
Configuration saved in bert-ner/checkpoint-1590/config.json
Model weights saved in bert-ner/checkpoint-1590/pytorch_model.bin
tokenizer config file saved in bert-ner/checkpoint-1590/tokenizer_config.json
Special tokens file saved in bert-ner/checkpoint-1590/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 16


{'LOC': {'precision': 0.8344208809135399, 'recall': 0.8611111111111112, 'f1-score': 0.8475559237779619, 'support': 1188}, 'MISC': {'precision': 0.7413127413127413, 'recall': 0.7252124645892352, 'f1-score': 0.7331742243436755, 'support': 1059}, 'ORG': {'precision': 0.8294372294372294, 'recall': 0.8063973063973064, 'f1-score': 0.8177550149381136, 'support': 1188}, 'PER': {'precision': 0.8912228057014253, 'recall': 0.8473609129814551, 'f1-score': 0.8687385740402193, 'support': 1402}, 'micro avg': {'precision': 0.8288421052631579, 'recall': 0.8139342567707256, 'f1-score': 0.8213205382288515, 'support': 4837}, 'macro avg': {'precision': 0.824098414341234, 'recall': 0.8100204487697769, 'f1-score': 0.8168059342749926, 'support': 4837}, 'weighted avg': {'precision': 0.8292759978789136, 'recall': 0.8139342567707256, 'f1-score': 0.8213339631133011, 'support': 4837}}


Saving model checkpoint to bert-ner/checkpoint-2385
Configuration saved in bert-ner/checkpoint-2385/config.json
Model weights saved in bert-ner/checkpoint-2385/pytorch_model.bin
tokenizer config file saved in bert-ner/checkpoint-2385/tokenizer_config.json
Special tokens file saved in bert-ner/checkpoint-2385/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-ner/checkpoint-2385 (score: 0.8177550149381136).
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 16


Evaluating...


{'LOC': {'precision': 0.8344208809135399, 'recall': 0.8611111111111112, 'f1-score': 0.8475559237779619, 'support': 1188}, 'MISC': {'precision': 0.7413127413127413, 'recall': 0.7252124645892352, 'f1-score': 0.7331742243436755, 'support': 1059}, 'ORG': {'precision': 0.8294372294372294, 'recall': 0.8063973063973064, 'f1-score': 0.8177550149381136, 'support': 1188}, 'PER': {'precision': 0.8912228057014253, 'recall': 0.8473609129814551, 'f1-score': 0.8687385740402193, 'support': 1402}, 'micro avg': {'precision': 0.8288421052631579, 'recall': 0.8139342567707256, 'f1-score': 0.8213205382288515, 'support': 4837}, 'macro avg': {'precision': 0.824098414341234, 'recall': 0.8100204487697769, 'f1-score': 0.8168059342749926, 'support': 4837}, 'weighted avg': {'precision': 0.8292759978789136, 'recall': 0.8139342567707256, 'f1-score': 0.8213339631133011, 'support': 4837}}


{'eval_loss': 0.15333393216133118,
 'eval_precision': 0.8294372294372294,
 'eval_recall': 0.8063973063973064,
 'eval_f1-score': 0.8177550149381136,
 'eval_support': 1188,
 'eval_runtime': 24.3041,
 'eval_samples_per_second': 126.234,
 'eval_steps_per_second': 7.9,
 'epoch': 3.0}

In [36]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 3160
  Batch size = 16


{'LOC': {'precision': 0.8199419167473379, 'recall': 0.8183574879227054, 'f1-score': 0.8191489361702128, 'support': 1035}, 'MISC': {'precision': 0.6904422253922967, 'recall': 0.7004341534008683, 'f1-score': 0.6954022988505747, 'support': 691}, 'ORG': {'precision': 0.7599431818181818, 'recall': 0.6921086675291074, 'f1-score': 0.7244414353419092, 'support': 773}, 'PER': {'precision': 0.9231418918918919, 'recall': 0.9138795986622074, 'f1-score': 0.9184873949579833, 'support': 1196}, 'micro avg': {'precision': 0.8169519602429597, 'recall': 0.8008119079837619, 'f1-score': 0.8088014213475467, 'support': 3695}, 'macro avg': {'precision': 0.798367303962427, 'recall': 0.7811949768787221, 'f1-score': 0.78937001633017, 'support': 3695}, 'weighted avg': {'precision': 0.8165762500210362, 'recall': 0.8008119079837619, 'f1-score': 0.8083481167336836, 'support': 3695}}


PredictionOutput(predictions=array([[[-1.3023505 , -1.3258182 , -2.4154558 , ..., -0.8018884 ,
         10.354628  , -1.0028332 ],
        [-1.7940614 , -1.495341  , -2.1086407 , ..., -0.9367676 ,
         10.650852  , -1.0499656 ],
        [-1.7680511 , -1.38106   , -2.2702239 , ..., -0.96954477,
         10.919002  , -1.2187723 ],
        ...,
        [-1.9073474 , -1.4430825 , -2.3211548 , ..., -1.1604518 ,
         10.544797  , -1.2130493 ],
        [-1.908674  , -1.5698878 , -2.2367675 , ..., -1.1669112 ,
         10.811916  , -1.1650187 ],
        [-0.96890014, -0.64375067, -2.8112307 , ...,  0.11057154,
          8.361332  ,  0.27761108]],

       [[-0.84958315, -1.0594717 , -3.0888276 , ..., -0.8509164 ,
          9.176275  ,  0.66015434],
        [-0.32701004, -0.799905  , -3.184164  , ..., -0.6350412 ,
          8.787204  ,  1.0725697 ],
        [-1.1844019 , -0.7825659 , -3.2853873 , ..., -0.53801477,
         10.727225  , -0.06466811],
        ...,
        [-1.2785027 , -1.