In [30]:
train_path = 'skweak_labelled_20_all.txt'

dev_path = 'dev_hand_fin.txt'
test_path = 'test_hand_fin.txt'

model_checkpoint = "bert-base-german-cased"

batch_size = 16
learning_rate = 5e-5
epoch = 4
weight_decay = 0.5

save_folder_name = 'bert_skweak'
save_model_name = 'bert_skweak_fin'

In [7]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import re
import pandas as pd
from datasets import Dataset

Load dataset

In [21]:
def load_data_weak(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == ' \t \n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split('\t',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [9]:
def load_data(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == '\n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split('\t',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [22]:
traindata = load_data_weak(train_path)
test = load_data(test_path)
dev = load_data(dev_path)

traindata.head()

Unnamed: 0,tokens,ner_tags
0,"[Maßnahmenbekanntgabe, zu, MA, 37, ,, Prüfung,...","[O, O, B-ORG, I-ORG, O, O, O, O]"
1,[INHALTSVERZEICHNIS],[O]
2,[ABKÜRZUNGSVERZEICHNIS],[O]
3,"[bzw., beziehungsweise, Nr., Nummer]","[O, O, O, O]"
4,"[Erledigung, des, Prüfungsberichtes, Der, Stad...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, B..."


In [11]:
label2id = {'O': 0,'B-ORG':1,'I-ORG':2}
id2label = {0 :'O', 1:'B-ORG', 2:'I-ORG'}
label_list = ['O','B-ORG','I-ORG']

In [12]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer
)

In [13]:
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label = id2label,
    label2id = label2id
)


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/485k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

In [23]:
def align_labels(data, label_encoding_dict):
    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    label_id_temp = {}
    for i, label in enumerate(data['ner_tags']):
        label_id_temp.update({i:label})

    word_ids = tokenized_inputs.word_ids(batch_index=0)

    labels = []
    for w_id in word_ids:
        if w_id == None:
            labels.append(-100)
        else:
            tag = label_id_temp[w_id]
            labels.append(label2id[tag])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

def tokenized_for_bert(df):
  for_bert = df.copy()
  for_bert['input_ids'] = ""
  for_bert['token_type_ids'] = ""
  for_bert['attention_mask']= ""
  for_bert['labels'] = ""

  for index, row in df.iterrows():
      inputs = align_labels(row,label2id)
      for_bert.at[index,'input_ids'] = inputs['input_ids']
      for_bert.at[index,'token_type_ids'] = inputs['token_type_ids']
      for_bert.at[index,'attention_mask'] = inputs['attention_mask']
      for_bert.at[index,'labels'] = inputs['labels']
  for_bert = for_bert.drop('tokens', axis = 1)
  for_bert = for_bert.drop('ner_tags', axis = 1)
  dataset_for_bert = Dataset.from_pandas(for_bert)
  print(dataset_for_bert)
  return dataset_for_bert

In [24]:
train_dataset = tokenized_for_bert(traindata)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 9182
})


In [25]:
dev_dataset = tokenized_for_bert(dev)
test_dataset = tokenized_for_bert(test)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1227
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1226
})


In [26]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=d97b7cc2a3ed5e629303157f9e2d0271d4797885b732c7c6343d94002d915484
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [31]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from seqeval.metrics import classification_report

In [32]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    result = classification_report(true_labels, true_predictions, output_dict = True)

    print(result)
    
    return result['ORG']

In [33]:
args = TrainingArguments(
    save_folder_name,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay = weight_decay,
    metric_for_best_model = 'f1-score',
    save_total_limit=5,
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()

***** Running training *****
  Num examples = 9182
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2296
  Number of trainable parameters = 108493059
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
1,0.0314,0.155738,0.620886,0.65467,0.637331,2334
2,0.0116,0.201438,0.587951,0.677378,0.629504,2334
3,0.0063,0.231406,0.603494,0.680805,0.639823,2334
4,0.0031,0.272531,0.591394,0.688946,0.636454,2334


***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.6208858187728565, 'recall': 0.6546700942587832, 'f1-score': 0.6373305526590197, 'support': 2334}, 'micro avg': {'precision': 0.6208858187728565, 'recall': 0.6546700942587832, 'f1-score': 0.6373305526590197, 'support': 2334}, 'macro avg': {'precision': 0.6208858187728565, 'recall': 0.6546700942587832, 'f1-score': 0.6373305526590197, 'support': 2334}, 'weighted avg': {'precision': 0.6208858187728565, 'recall': 0.6546700942587832, 'f1-score': 0.6373305526590197, 'support': 2334}}


Saving model checkpoint to bert_skweak/checkpoint-574
Configuration saved in bert_skweak/checkpoint-574/config.json
Model weights saved in bert_skweak/checkpoint-574/pytorch_model.bin
tokenizer config file saved in bert_skweak/checkpoint-574/tokenizer_config.json
Special tokens file saved in bert_skweak/checkpoint-574/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.5879509111193753, 'recall': 0.6773778920308483, 'f1-score': 0.6295042803105715, 'support': 2334}, 'micro avg': {'precision': 0.5879509111193753, 'recall': 0.6773778920308483, 'f1-score': 0.6295042803105715, 'support': 2334}, 'macro avg': {'precision': 0.5879509111193753, 'recall': 0.6773778920308483, 'f1-score': 0.6295042803105715, 'support': 2334}, 'weighted avg': {'precision': 0.5879509111193753, 'recall': 0.6773778920308483, 'f1-score': 0.6295042803105715, 'support': 2334}}


Saving model checkpoint to bert_skweak/checkpoint-1148
Configuration saved in bert_skweak/checkpoint-1148/config.json
Model weights saved in bert_skweak/checkpoint-1148/pytorch_model.bin
tokenizer config file saved in bert_skweak/checkpoint-1148/tokenizer_config.json
Special tokens file saved in bert_skweak/checkpoint-1148/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}, 'micro avg': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}, 'macro avg': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}, 'weighted avg': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}}


Saving model checkpoint to bert_skweak/checkpoint-1722
Configuration saved in bert_skweak/checkpoint-1722/config.json
Model weights saved in bert_skweak/checkpoint-1722/pytorch_model.bin
tokenizer config file saved in bert_skweak/checkpoint-1722/tokenizer_config.json
Special tokens file saved in bert_skweak/checkpoint-1722/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.59139389481427, 'recall': 0.6889460154241646, 'f1-score': 0.6364535919255888, 'support': 2334}, 'micro avg': {'precision': 0.59139389481427, 'recall': 0.6889460154241646, 'f1-score': 0.6364535919255888, 'support': 2334}, 'macro avg': {'precision': 0.59139389481427, 'recall': 0.6889460154241646, 'f1-score': 0.6364535919255888, 'support': 2334}, 'weighted avg': {'precision': 0.59139389481427, 'recall': 0.6889460154241646, 'f1-score': 0.6364535919255888, 'support': 2334}}


Saving model checkpoint to bert_skweak/checkpoint-2296
Configuration saved in bert_skweak/checkpoint-2296/config.json
Model weights saved in bert_skweak/checkpoint-2296/pytorch_model.bin
tokenizer config file saved in bert_skweak/checkpoint-2296/tokenizer_config.json
Special tokens file saved in bert_skweak/checkpoint-2296/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert_skweak/checkpoint-1722 (score: 0.6398228306825045).
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


Evaluating...


{'ORG': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}, 'micro avg': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}, 'macro avg': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}, 'weighted avg': {'precision': 0.6034941131788834, 'recall': 0.6808054841473865, 'f1-score': 0.6398228306825045, 'support': 2334}}


{'eval_loss': 0.23140588402748108,
 'eval_precision': 0.6034941131788834,
 'eval_recall': 0.6808054841473865,
 'eval_f1-score': 0.6398228306825045,
 'eval_support': 2334,
 'eval_runtime': 10.0313,
 'eval_samples_per_second': 122.317,
 'eval_steps_per_second': 7.676,
 'epoch': 4.0}

In [34]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1226
  Batch size = 16


{'ORG': {'precision': 0.6425052779732583, 'recall': 0.6880180859080633, 'f1-score': 0.6644832605531296, 'support': 2654}, 'micro avg': {'precision': 0.6425052779732583, 'recall': 0.6880180859080633, 'f1-score': 0.6644832605531296, 'support': 2654}, 'macro avg': {'precision': 0.6425052779732583, 'recall': 0.6880180859080633, 'f1-score': 0.6644832605531296, 'support': 2654}, 'weighted avg': {'precision': 0.6425052779732583, 'recall': 0.6880180859080633, 'f1-score': 0.6644832605531296, 'support': 2654}}


PredictionOutput(predictions=array([[[ 8.074237  , -3.6468012 , -3.0018382 ],
        [ 7.9196115 , -3.4937391 , -2.8766022 ],
        [ 7.5850224 , -2.76133   , -3.4097521 ],
        ...,
        [ 8.02725   , -3.6919303 , -3.8014567 ],
        [ 7.808679  , -3.5366495 , -3.5520136 ],
        [ 7.858851  , -3.3837903 , -3.7832403 ]],

       [[ 7.583177  , -3.430417  , -3.1408954 ],
        [ 7.094295  , -2.9067159 , -3.3074303 ],
        [ 3.3324177 ,  0.51144004, -2.6907992 ],
        ...,
        [ 6.5760856 , -3.064727  , -3.541926  ],
        [ 7.8078456 , -3.6376245 , -3.9557762 ],
        [ 4.6615334 , -3.2266529 , -0.9084553 ]],

       [[ 8.026039  , -3.7155654 , -2.8322885 ],
        [ 8.750321  , -4.4810057 , -3.1605275 ],
        [ 8.785961  , -4.2658024 , -3.6512444 ],
        ...,
        [ 9.04862   , -3.872757  , -4.1589384 ],
        [ 8.83294   , -3.5992982 , -4.223845  ],
        [ 4.5943255 , -1.4913898 , -1.1867756 ]],

       ...,

       [[ 8.297818  , -3.779044