In [4]:
train_path = 'deu.train.utf.txt'
dev_path = 'deu.testa.utf.txt'
test_path = 'deu.testb.utf.txt'
model_checkpoint = "xlm-roberta-base"

epoch = 4
batch_size = 32
learning_rate = 5e-5
weight_decay = 0.2

save_folder_name = 'xlm_conll'

In [5]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Using cached datasets-2.8.0-py3-none-any.whl (452 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py38-none-any.whl (132 kB)
Collecting xxhash
  Using cached xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3<1.27,>=1.21.1
  Using cached urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
Installing collected packages: xxhash, urllib3, multiprocess, responses, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed datasets-2.8.0 multiprocess-0.70.14 responses-0.18.0 urllib3-1.26.13 xxhash-3.2.0


In [6]:
import re
import pandas as pd
from datasets import Dataset

Load dataset

In [7]:
def load_data(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == '\n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split(' ',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [8]:
traindata = load_data(train_path)
test = load_data(test_path)
dev = load_data(dev_path)

traindata.head()

Unnamed: 0,tokens,ner_tags
0,[-DOCSTART-],[O]
1,"[Ereignis, und, Erzählung, oder, :]","[O, O, O, O, O]"
2,"[Schwierigkeiten, beim, nachvollziehenden, Ver...","[O, O, O, O, I-PER, I-PER, O, O, O, O, O, I-LO..."
3,"[Diskussionen, über, Asylbewerber-, ,, Aussied...","[O, O, O, O, O, O, O, O, O, I-LOC, O, O, O, O,..."
4,"[Es, ist, deswegen, momentan, nicht, schwer, ,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [9]:
tags_train = [x for x in traindata['ner_tags']]
tags_dev = [x for x in dev['ner_tags']]
tags_test = [x for x in test['ner_tags']]
tags = tags_train + tags_dev + tags_test
label_list = list(set([item for sublist in tags for item in sublist]))
label_list

['I-LOC', 'I-MISC', 'B-LOC', 'B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'O', 'I-ORG']

In [10]:
label2id = {}
id2label= {}
count = 0
for i in label_list:
  label2id.update({i:count})
  id2label.update({count:i})
  count += 1
label2id

{'I-LOC': 0,
 'I-MISC': 1,
 'B-LOC': 2,
 'B-ORG': 3,
 'B-MISC': 4,
 'B-PER': 5,
 'I-PER': 6,
 'O': 7,
 'I-ORG': 8}

In [11]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer
)

In [12]:
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label = id2label,
    label2id = label2id
)


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config
)

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [14]:
def align_labels(data, label_encoding_dict):
    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    label_id_temp = {}
    for i, label in enumerate(data['ner_tags']):
        label_id_temp.update({i:label})

    word_ids = tokenized_inputs.word_ids(batch_index=0)

    labels = []
    for w_id in word_ids:
        if w_id == None:
            labels.append(-100)
        else:
            tag = label_id_temp[w_id]
            labels.append(label2id[tag])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

def tokenized_for_bert(df):
  for_bert = df.copy()
  for_bert['input_ids'] = ""
  # for_bert['token_type_ids'] = ""
  for_bert['attention_mask']= ""
  for_bert['labels'] = ""

  for index, row in df.iterrows():
      inputs = align_labels(row,label2id)
      for_bert.at[index,'input_ids'] = inputs['input_ids']
      # for_bert.at[index,'token_type_ids'] = inputs['token_type_ids']
      for_bert.at[index,'attention_mask'] = inputs['attention_mask']
      for_bert.at[index,'labels'] = inputs['labels']
  for_bert = for_bert.drop('tokens', axis = 1)
  for_bert = for_bert.drop('ner_tags', axis = 1)
  dataset_for_bert = Dataset.from_pandas(for_bert)
  print(dataset_for_bert)
  return dataset_for_bert

In [15]:
train_dataset = tokenized_for_bert(traindata)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12705
})


In [16]:
dev_dataset = tokenized_for_bert(dev)
test_dataset = tokenized_for_bert(test)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3068
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3160
})


In [17]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=b737a4222556f0bcaab81f18bcf43eadf2f823882590e4f61da5f5e821b185b0
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [18]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from seqeval.metrics import classification_report

In [22]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    result = classification_report(true_labels, true_predictions, output_dict = True)

    print(result)
    
    return result['ORG']

In [20]:

args = TrainingArguments(
    f"xlm-ner",
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay = weight_decay,
    metric_for_best_model = 'f1-score',
    save_total_limit=5,
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()

***** Running training *****
  Num examples = 12705
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1592
  Number of trainable parameters = 277459977
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
1,0.1741,0.127786,0.751377,0.803872,0.776739,1188
2,0.0704,0.124683,0.814564,0.809764,0.812157,1188
3,0.0421,0.134151,0.835106,0.792929,0.813472,1188
4,0.0246,0.146626,0.843777,0.827441,0.835529,1188


***** Running Evaluation *****
  Num examples = 3068
  Batch size = 32


{'LOC': {'precision': 0.8477477477477477, 'recall': 0.7900923593618808, 'f1-score': 0.8179052585832247, 'support': 1191}, 'MISC': {'precision': 0.717, 'recall': 0.6719775070290535, 'f1-score': 0.6937590711175616, 'support': 1067}, 'ORG': {'precision': 0.7513768686073957, 'recall': 0.8038720538720538, 'f1-score': 0.7767385115900772, 'support': 1188}, 'PER': {'precision': 0.9266521423384169, 'recall': 0.909479686386315, 'f1-score': 0.9179856115107914, 'support': 1403}, 'micro avg': {'precision': 0.8173602353930223, 'recall': 0.8020210352650031, 'f1-score': 0.8096179868845632, 'support': 4849}, 'macro avg': {'precision': 0.8106941896733901, 'recall': 0.7938554016623258, 'f1-score': 0.8015971132004138, 'support': 4849}, 'weighted avg': {'precision': 0.8181965855174165, 'recall': 0.8020210352650031, 'f1-score': 0.8094587041810086, 'support': 4849}}


Saving model checkpoint to xlm-ner/checkpoint-398
Configuration saved in xlm-ner/checkpoint-398/config.json
Model weights saved in xlm-ner/checkpoint-398/pytorch_model.bin
tokenizer config file saved in xlm-ner/checkpoint-398/tokenizer_config.json
Special tokens file saved in xlm-ner/checkpoint-398/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 32


{'LOC': {'precision': 0.8283157038242474, 'recall': 0.8547439126784215, 'f1-score': 0.8413223140495867, 'support': 1191}, 'MISC': {'precision': 0.7795358649789029, 'recall': 0.6925960637300843, 'f1-score': 0.7334987593052109, 'support': 1067}, 'ORG': {'precision': 0.8145639288738358, 'recall': 0.8097643097643098, 'f1-score': 0.8121570282819756, 'support': 1188}, 'PER': {'precision': 0.9517980107115531, 'recall': 0.8866714183891661, 'f1-score': 0.918081180811808, 'support': 1403}, 'micro avg': {'precision': 0.8495176848874598, 'recall': 0.8172819137966592, 'f1-score': 0.8330880807231449, 'support': 4849}, 'macro avg': {'precision': 0.8435533770971347, 'recall': 0.8109439261404955, 'f1-score': 0.8262648206121453, 'support': 4849}, 'weighted avg': {'precision': 0.8499408801232406, 'recall': 0.8172819137966592, 'f1-score': 0.832660032685022, 'support': 4849}}


Saving model checkpoint to xlm-ner/checkpoint-796
Configuration saved in xlm-ner/checkpoint-796/config.json
Model weights saved in xlm-ner/checkpoint-796/pytorch_model.bin
tokenizer config file saved in xlm-ner/checkpoint-796/tokenizer_config.json
Special tokens file saved in xlm-ner/checkpoint-796/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 32


{'LOC': {'precision': 0.8554522400676247, 'recall': 0.8497061293031066, 'f1-score': 0.8525695029486099, 'support': 1191}, 'MISC': {'precision': 0.7628458498023716, 'recall': 0.7235238987816307, 'f1-score': 0.7426647426647427, 'support': 1067}, 'ORG': {'precision': 0.8351063829787234, 'recall': 0.7929292929292929, 'f1-score': 0.8134715025906736, 'support': 1188}, 'PER': {'precision': 0.9427121102248006, 'recall': 0.9265858873841768, 'f1-score': 0.9345794392523366, 'support': 1403}, 'micro avg': {'precision': 0.8562313908974905, 'recall': 0.8302742833573933, 'f1-score': 0.8430530834467594, 'support': 4849}, 'macro avg': {'precision': 0.8490291457683801, 'recall': 0.8231863020995518, 'f1-score': 0.8358212968640907, 'support': 4849}, 'weighted avg': {'precision': 0.8553375156287463, 'recall': 0.8302742833573933, 'f1-score': 0.8425350911082332, 'support': 4849}}


Saving model checkpoint to xlm-ner/checkpoint-1194
Configuration saved in xlm-ner/checkpoint-1194/config.json
Model weights saved in xlm-ner/checkpoint-1194/pytorch_model.bin
tokenizer config file saved in xlm-ner/checkpoint-1194/tokenizer_config.json
Special tokens file saved in xlm-ner/checkpoint-1194/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 32


{'LOC': {'precision': 0.8591549295774648, 'recall': 0.8706968933669186, 'f1-score': 0.8648874061718098, 'support': 1191}, 'MISC': {'precision': 0.7899686520376176, 'recall': 0.7085285848172446, 'f1-score': 0.7470355731225296, 'support': 1067}, 'ORG': {'precision': 0.8437768240343347, 'recall': 0.8274410774410774, 'f1-score': 0.8355291117722058, 'support': 1188}, 'PER': {'precision': 0.9381368267831149, 'recall': 0.9187455452601568, 'f1-score': 0.9283399351818509, 'support': 1403}, 'micro avg': {'precision': 0.8643419094195195, 'recall': 0.8383171787997525, 'f1-score': 0.8511306532663317, 'support': 4849}, 'macro avg': {'precision': 0.857759308108133, 'recall': 0.8313530252213494, 'f1-score': 0.843948006562099, 'support': 4849}, 'weighted avg': {'precision': 0.8630156543164361, 'recall': 0.8383171787997525, 'f1-score': 0.8501211324227431, 'support': 4849}}


Saving model checkpoint to xlm-ner/checkpoint-1592
Configuration saved in xlm-ner/checkpoint-1592/config.json
Model weights saved in xlm-ner/checkpoint-1592/pytorch_model.bin
tokenizer config file saved in xlm-ner/checkpoint-1592/tokenizer_config.json
Special tokens file saved in xlm-ner/checkpoint-1592/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from xlm-ner/checkpoint-1592 (score: 0.8355291117722058).
***** Running Evaluation *****
  Num examples = 3068
  Batch size = 32


Evaluating...


{'LOC': {'precision': 0.8591549295774648, 'recall': 0.8706968933669186, 'f1-score': 0.8648874061718098, 'support': 1191}, 'MISC': {'precision': 0.7899686520376176, 'recall': 0.7085285848172446, 'f1-score': 0.7470355731225296, 'support': 1067}, 'ORG': {'precision': 0.8437768240343347, 'recall': 0.8274410774410774, 'f1-score': 0.8355291117722058, 'support': 1188}, 'PER': {'precision': 0.9381368267831149, 'recall': 0.9187455452601568, 'f1-score': 0.9283399351818509, 'support': 1403}, 'micro avg': {'precision': 0.8643419094195195, 'recall': 0.8383171787997525, 'f1-score': 0.8511306532663317, 'support': 4849}, 'macro avg': {'precision': 0.857759308108133, 'recall': 0.8313530252213494, 'f1-score': 0.843948006562099, 'support': 4849}, 'weighted avg': {'precision': 0.8630156543164361, 'recall': 0.8383171787997525, 'f1-score': 0.8501211324227431, 'support': 4849}}


{'eval_loss': 0.14662572741508484,
 'eval_precision': 0.8437768240343347,
 'eval_recall': 0.8274410774410774,
 'eval_f1-score': 0.8355291117722058,
 'eval_support': 1188,
 'eval_runtime': 20.9974,
 'eval_samples_per_second': 146.113,
 'eval_steps_per_second': 4.572,
 'epoch': 4.0}

In [23]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 3160
  Batch size = 32


{'LOC': {'precision': 0.8397626112759644, 'recall': 0.8218780251694094, 'f1-score': 0.8307240704500978, 'support': 1033}, 'MISC': {'precision': 0.75, 'recall': 0.6864406779661016, 'f1-score': 0.7168141592920354, 'support': 708}, 'ORG': {'precision': 0.75, 'recall': 0.7218628719275549, 'f1-score': 0.7356624917600528, 'support': 773}, 'PER': {'precision': 0.9230132450331126, 'recall': 0.9346186085498742, 'f1-score': 0.9287796751353602, 'support': 1193}, 'micro avg': {'precision': 0.8330102464691221, 'recall': 0.8114378203398975, 'f1-score': 0.8220825362120798, 'support': 3707}, 'macro avg': {'precision': 0.8156939640772692, 'recall': 0.7912000459032351, 'f1-score': 0.8029950991593865, 'support': 3707}, 'weighted avg': {'precision': 0.8306931693478755, 'recall': 0.8114378203398975, 'f1-score': 0.8207023598922896, 'support': 3707}}


PredictionOutput(predictions=array([[[ 0.73928005,  0.17507993, -2.1409671 , ..., -0.75855327,
          5.584591  , -1.0954525 ],
        [-1.0639623 , -1.1147895 , -2.3909903 , ..., -0.81149507,
          9.927893  , -1.6635995 ],
        [-1.1300712 , -0.9387541 , -2.4725218 , ..., -0.97768605,
          9.810606  , -1.6785362 ],
        ...,
        [ 0.845791  ,  0.24014622, -1.8566028 , ..., -0.75918406,
          5.048185  , -0.9934535 ],
        [ 0.845791  ,  0.24014622, -1.8566028 , ..., -0.75918406,
          5.048185  , -0.9934535 ],
        [ 0.845791  ,  0.24014622, -1.8566028 , ..., -0.75918406,
          5.048185  , -0.9934535 ]],

       [[ 1.0349097 , -0.4071042 , -2.283815  , ..., -0.34858763,
          5.2560096 , -0.5274501 ],
        [ 1.3449433 , -1.40646   , -3.0685349 , ..., -0.41854358,
          7.5041237 ,  0.32704538],
        [-0.3669902 , -1.1532416 , -3.140769  , ..., -0.3178445 ,
          9.282457  , -0.66589975],
        ...,
        [ 1.0567    , -0.