In [6]:
train_path = 'train_hand_fin.txt'
dev_path = 'dev_hand_fin.txt'
test_path = 'test_hand_fin.txt'
model_checkpoint = "xlm-roberta-base"

epoch = 4
batch_size = 16
learning_rate = 5e-5
weight_decay = 0.0

save_folder_name = 'xlm_handdata'
save_model_name = 'xlm_handdata_fin'

In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us

In [3]:
import re
import pandas as pd
from datasets import Dataset

Load dataset

In [7]:
def load_data(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == '\n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split('\t',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [8]:
traindata = load_data(train_path)
test = load_data(test_path)
dev = load_data(dev_path)

traindata.head()

Unnamed: 0,tokens,ner_tags
0,[ABKÜRZUNGSVERZEICHNIS],[O]
1,"[Empfehlung, Nr., 10, Ein, Überblick, der, Akt...","[O, O, O, O, O, O, O, O, O, O, B-ORG, I-ORG, O..."
2,"[Betreutes, Konto, Ist, ein, von, der, Schuldn...","[O, O, O, O, O, O, B-ORG, I-ORG, I-ORG, I-ORG,..."
3,"[Insgesamt, verfügte, die, Rettungsleitstelle,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Empfehlungen, ,, die, in, einem, früheren, Be...","[O, O, O, O, O, O, O, O, B-ORG, I-ORG, O, O, O..."


In [9]:
label2id = {'O': 0,'B-ORG':1,'I-ORG':2}
id2label = {0 :'O', 1:'B-ORG', 2:'I-ORG'}
label_list = ['O','B-ORG','I-ORG']

In [10]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer
)

In [11]:
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label = id2label,
    label2id = label2id
)


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config
)

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-st

In [13]:
def align_labels(data, label_encoding_dict):
    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    label_id_temp = {}
    for i, label in enumerate(data['ner_tags']):
        label_id_temp.update({i:label})

    word_ids = tokenized_inputs.word_ids(batch_index=0)

    labels = []
    for w_id in word_ids:
        if w_id == None:
            labels.append(-100)
        else:
            tag = label_id_temp[w_id]
            labels.append(label2id[tag])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

def tokenized_for_bert(df):
  for_bert = df.copy()
  for_bert['input_ids'] = ""
  for_bert['attention_mask']= ""
  for_bert['labels'] = ""

  for index, row in df.iterrows():
      inputs = align_labels(row,label2id)
      for_bert.at[index,'input_ids'] = inputs['input_ids']
      for_bert.at[index,'attention_mask'] = inputs['attention_mask']
      for_bert.at[index,'labels'] = inputs['labels']
  for_bert = for_bert.drop('tokens', axis = 1)
  for_bert = for_bert.drop('ner_tags', axis = 1)
  dataset_for_bert = Dataset.from_pandas(for_bert)
  print(dataset_for_bert)
  return dataset_for_bert

In [14]:
train_dataset = tokenized_for_bert(traindata)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3678
})


In [15]:
dev_dataset = tokenized_for_bert(dev)
test_dataset = tokenized_for_bert(test)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1227
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1226
})


In [16]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=2f00d30f3dd44f53a3b31613fff8412f1a165c54981ff73ad59cd1dbd09ed531
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [17]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from seqeval.metrics import classification_report

In [18]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    result = classification_report(true_labels, true_predictions, output_dict = True)

    print(result)
    
    return result['ORG']

In [20]:
batch_size = 16
learning_rate = 5e-5
epoch = 4
weight_decay = 0.1

args = TrainingArguments(
    save_folder_name,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay = weight_decay,
    metric_for_best_model = 'f1-score',
    save_total_limit=5,
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3678
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 920
  Number of trainable parameters = 277455363
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
1,0.0992,0.057003,0.719056,0.831838,0.771346,2563
2,0.0463,0.050536,0.762794,0.86071,0.808799,2563
3,0.034,0.052732,0.742385,0.893874,0.811117,2563
4,0.0242,0.047498,0.793347,0.874756,0.832065,2563


***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.7190556492411467, 'recall': 0.8318376902067889, 'f1-score': 0.7713458755426917, 'support': 2563}, 'micro avg': {'precision': 0.7190556492411467, 'recall': 0.8318376902067889, 'f1-score': 0.7713458755426917, 'support': 2563}, 'macro avg': {'precision': 0.7190556492411467, 'recall': 0.8318376902067889, 'f1-score': 0.7713458755426917, 'support': 2563}, 'weighted avg': {'precision': 0.7190556492411467, 'recall': 0.8318376902067889, 'f1-score': 0.7713458755426917, 'support': 2563}}


Saving model checkpoint to xlm_handdata/checkpoint-230
Configuration saved in xlm_handdata/checkpoint-230/config.json
Model weights saved in xlm_handdata/checkpoint-230/pytorch_model.bin
tokenizer config file saved in xlm_handdata/checkpoint-230/tokenizer_config.json
Special tokens file saved in xlm_handdata/checkpoint-230/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.7627939142461964, 'recall': 0.8607101053452985, 'f1-score': 0.8087992667277727, 'support': 2563}, 'micro avg': {'precision': 0.7627939142461964, 'recall': 0.8607101053452985, 'f1-score': 0.8087992667277727, 'support': 2563}, 'macro avg': {'precision': 0.7627939142461964, 'recall': 0.8607101053452985, 'f1-score': 0.8087992667277727, 'support': 2563}, 'weighted avg': {'precision': 0.7627939142461964, 'recall': 0.8607101053452985, 'f1-score': 0.8087992667277727, 'support': 2563}}


Saving model checkpoint to xlm_handdata/checkpoint-460
Configuration saved in xlm_handdata/checkpoint-460/config.json
Model weights saved in xlm_handdata/checkpoint-460/pytorch_model.bin
tokenizer config file saved in xlm_handdata/checkpoint-460/tokenizer_config.json
Special tokens file saved in xlm_handdata/checkpoint-460/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.7423849643551523, 'recall': 0.8938743659773702, 'f1-score': 0.8111170118605063, 'support': 2563}, 'micro avg': {'precision': 0.7423849643551523, 'recall': 0.8938743659773702, 'f1-score': 0.8111170118605063, 'support': 2563}, 'macro avg': {'precision': 0.7423849643551523, 'recall': 0.8938743659773702, 'f1-score': 0.8111170118605063, 'support': 2563}, 'weighted avg': {'precision': 0.7423849643551523, 'recall': 0.8938743659773702, 'f1-score': 0.8111170118605062, 'support': 2563}}


Saving model checkpoint to xlm_handdata/checkpoint-690
Configuration saved in xlm_handdata/checkpoint-690/config.json
Model weights saved in xlm_handdata/checkpoint-690/pytorch_model.bin
tokenizer config file saved in xlm_handdata/checkpoint-690/tokenizer_config.json
Special tokens file saved in xlm_handdata/checkpoint-690/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


{'ORG': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}, 'micro avg': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}, 'macro avg': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}, 'weighted avg': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}}


Saving model checkpoint to xlm_handdata/checkpoint-920
Configuration saved in xlm_handdata/checkpoint-920/config.json
Model weights saved in xlm_handdata/checkpoint-920/pytorch_model.bin
tokenizer config file saved in xlm_handdata/checkpoint-920/tokenizer_config.json
Special tokens file saved in xlm_handdata/checkpoint-920/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from xlm_handdata/checkpoint-920 (score: 0.8320653182408609).
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 16


Evaluating...


{'ORG': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}, 'micro avg': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}, 'macro avg': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}, 'weighted avg': {'precision': 0.7933474876150035, 'recall': 0.8747561451424113, 'f1-score': 0.8320653182408609, 'support': 2563}}


{'eval_loss': 0.04749751836061478,
 'eval_precision': 0.7933474876150035,
 'eval_recall': 0.8747561451424113,
 'eval_f1-score': 0.8320653182408609,
 'eval_support': 2563,
 'eval_runtime': 9.8751,
 'eval_samples_per_second': 124.252,
 'eval_steps_per_second': 7.797,
 'epoch': 4.0}

In [21]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1226
  Batch size = 16


{'ORG': {'precision': 0.8161652993011242, 'recall': 0.899531145344943, 'f1-score': 0.8558228453082682, 'support': 2986}, 'micro avg': {'precision': 0.8161652993011242, 'recall': 0.899531145344943, 'f1-score': 0.8558228453082682, 'support': 2986}, 'macro avg': {'precision': 0.8161652993011242, 'recall': 0.899531145344943, 'f1-score': 0.8558228453082682, 'support': 2986}, 'weighted avg': {'precision': 0.8161652993011242, 'recall': 0.899531145344943, 'f1-score': 0.8558228453082682, 'support': 2986}}


PredictionOutput(predictions=array([[[ 2.159741  , -1.5088053 , -1.4643275 ],
        [ 7.374598  , -3.0770729 , -3.3982024 ],
        [ 7.43936   , -2.6871061 , -4.0631886 ],
        ...,
        [ 2.2168741 , -1.6025752 , -1.4707897 ],
        [ 2.2168741 , -1.6025752 , -1.4707897 ],
        [ 2.2168741 , -1.6025752 , -1.4707897 ]],

       [[ 2.2446141 , -1.1788807 , -1.8513404 ],
        [ 7.5165505 , -2.5778208 , -4.1041303 ],
        [-0.6622845 ,  4.4080653 , -3.2436173 ],
        ...,
        [ 1.6935023 ,  0.48925114, -0.31005773],
        [ 1.6935023 ,  0.48925114, -0.31005773],
        [ 1.6935023 ,  0.48925114, -0.31005773]],

       [[ 2.9861934 , -1.5706313 , -2.0955298 ],
        [ 7.1965    , -3.2472298 , -3.5504544 ],
        [ 7.0975633 , -2.9308023 , -3.5448513 ],
        ...,
        [ 7.1261215 , -3.0724607 , -3.4692984 ],
        [ 7.419703  , -3.22181   , -3.0791702 ],
        [ 2.9749155 , -1.5993192 , -2.0804973 ]],

       ...,

       [[ 2.4630885 , -0.775236

In [22]:
trainer.save_model(save_folder_name)
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [23]:
from transformers import AutoModel

model_trained = AutoModel.from_pretrained(save_folder_name)
model_trained.push_to_hub(save_model_name)

tokenizer_trained = AutoTokenizer.from_pretrained(save_folder_name)
tokenizer_trained.push_to_hub(save_model_name)

loading configuration file xlm_handdata/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm_handdata",
  "architectures": [
    "XLMRobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-ORG",
    "2": "I-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-ORG": 1,
    "I-ORG": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file xlm_handdata/pytorch_model.bin
Some weights of t

CommitInfo(commit_url='https://huggingface.co/Pacho/xlm_handdata_fin/commit/ba0ea42afa90cb77a58861a063ce9a228b2641a0', commit_message='Upload tokenizer', commit_description='', oid='ba0ea42afa90cb77a58861a063ce9a228b2641a0', pr_url=None, pr_revision=None, pr_num=None)