In [1]:

train_path = 'skweak_labelled_20_all.txt'

dev_path = 'dev_hand_fin.txt'
test_path = 'test_hand_fin.txt'

model_checkpoint = "xlm-roberta-base"

batch_size = 32
learning_rate = 0.00002
epoch = 2
weight_decay = 0.3

save_folder_name = 'xlm_alldata'
save_model_name = 'xlm_alldata_fin'

In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://u

In [3]:
import re
import pandas as pd
from datasets import Dataset

Load dataset

In [4]:
def load_data_weak(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == ' \t \n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split('\t',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [5]:
def load_data(path):
  file = open(path, 'r')
  lines = file.readlines()

  tokens = []
  ner_tags = []

  temp_toks = []
  temp_tags = []

  for i in lines:
    if i == '\n':
      tokens.append(temp_toks)
      ner_tags.append(temp_tags)

      temp_toks = []
      temp_tags = []

    else:
      temp = re.split('\t',i)
      temp_toks.append(temp[0])
      temp_tags.append(re.sub('\n','',temp[-1]))
  
  data = pd.DataFrame({'tokens':tokens,
                       'ner_tags':ner_tags})
  return data


In [6]:
traindata = load_data_weak(train_path)
test = load_data(test_path)
dev = load_data(dev_path)

traindata.head()

Unnamed: 0,tokens,ner_tags
0,"[Maßnahmenbekanntgabe, zu, MA, 37, ,, Prüfung,...","[O, O, B-ORG, I-ORG, O, O, O, O]"
1,[INHALTSVERZEICHNIS],[O]
2,[ABKÜRZUNGSVERZEICHNIS],[O]
3,"[bzw., beziehungsweise, Nr., Nummer]","[O, O, O, O]"
4,"[Erledigung, des, Prüfungsberichtes, Der, Stad...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, B..."


In [7]:
label2id = {'O': 0,'B-ORG':1,'I-ORG':2}
id2label = {0 :'O', 1:'B-ORG', 2:'I-ORG'}
label_list = ['O','B-ORG','I-ORG']

In [8]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer
)

In [9]:
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label = id2label,
    label2id = label2id
)


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config
)

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-st

In [11]:
def align_labels(data, label_encoding_dict):
    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    label_id_temp = {}
    for i, label in enumerate(data['ner_tags']):
        label_id_temp.update({i:label})

    word_ids = tokenized_inputs.word_ids(batch_index=0)

    labels = []
    for w_id in word_ids:
        if w_id == None:
            labels.append(-100)
        else:
            tag = label_id_temp[w_id]
            labels.append(label2id[tag])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

def tokenized_for_bert(df):
  for_bert = df.copy()
  for_bert['input_ids'] = ""
  # for_bert['token_type_ids'] = ""
  for_bert['attention_mask']= ""
  for_bert['labels'] = ""

  for index, row in df.iterrows():
      inputs = align_labels(row,label2id)
      for_bert.at[index,'input_ids'] = inputs['input_ids']
      # for_bert.at[index,'token_type_ids'] = inputs['token_type_ids']
      for_bert.at[index,'attention_mask'] = inputs['attention_mask']
      for_bert.at[index,'labels'] = inputs['labels']
  for_bert = for_bert.drop('tokens', axis = 1)
  for_bert = for_bert.drop('ner_tags', axis = 1)
  dataset_for_bert = Dataset.from_pandas(for_bert)
  print(dataset_for_bert)
  return dataset_for_bert

In [12]:
train_dataset = tokenized_for_bert(traindata)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10178
})


In [13]:
dev_dataset = tokenized_for_bert(dev)
test_dataset = tokenized_for_bert(test)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1227
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1226
})


In [14]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=382b8de463413fa411a00eea9bfc1d25e57f1139f156d221aa90fce234605aba
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [15]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from seqeval.metrics import classification_report

In [16]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    result = classification_report(true_labels, true_predictions, output_dict = True)

    print(result)
    
    return result['ORG']

In [17]:
args = TrainingArguments(
    f"xlm-ner",
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay = weight_decay,
    metric_for_best_model = 'f1-score',
    save_total_limit=5,
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()

***** Running training *****
  Num examples = 10178
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 638
  Number of trainable parameters = 277455363
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
1,0.0741,0.159208,0.613621,0.72064,0.662839,2563
2,0.0197,0.18079,0.620374,0.712837,0.663399,2563


***** Running Evaluation *****
  Num examples = 1227
  Batch size = 32


{'ORG': {'precision': 0.6136212624584717, 'recall': 0.7206398751463129, 'f1-score': 0.6628386865243137, 'support': 2563}, 'micro avg': {'precision': 0.6136212624584717, 'recall': 0.7206398751463129, 'f1-score': 0.6628386865243137, 'support': 2563}, 'macro avg': {'precision': 0.6136212624584717, 'recall': 0.7206398751463129, 'f1-score': 0.6628386865243137, 'support': 2563}, 'weighted avg': {'precision': 0.6136212624584717, 'recall': 0.7206398751463129, 'f1-score': 0.6628386865243137, 'support': 2563}}


Saving model checkpoint to xlm-ner/checkpoint-319
Configuration saved in xlm-ner/checkpoint-319/config.json
Model weights saved in xlm-ner/checkpoint-319/pytorch_model.bin
tokenizer config file saved in xlm-ner/checkpoint-319/tokenizer_config.json
Special tokens file saved in xlm-ner/checkpoint-319/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 32


{'ORG': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}, 'micro avg': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}, 'macro avg': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}, 'weighted avg': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}}


Saving model checkpoint to xlm-ner/checkpoint-638
Configuration saved in xlm-ner/checkpoint-638/config.json
Model weights saved in xlm-ner/checkpoint-638/pytorch_model.bin
tokenizer config file saved in xlm-ner/checkpoint-638/tokenizer_config.json
Special tokens file saved in xlm-ner/checkpoint-638/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from xlm-ner/checkpoint-638 (score: 0.6633986928104576).
***** Running Evaluation *****
  Num examples = 1227
  Batch size = 32


Evaluating...


{'ORG': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}, 'micro avg': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}, 'macro avg': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}, 'weighted avg': {'precision': 0.6203735144312393, 'recall': 0.7128365197034725, 'f1-score': 0.6633986928104576, 'support': 2563}}


{'eval_loss': 0.1807904839515686,
 'eval_precision': 0.6203735144312393,
 'eval_recall': 0.7128365197034725,
 'eval_f1-score': 0.6633986928104576,
 'eval_support': 2563,
 'eval_runtime': 9.0772,
 'eval_samples_per_second': 135.174,
 'eval_steps_per_second': 4.296,
 'epoch': 2.0}

In [18]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1226
  Batch size = 32


{'ORG': {'precision': 0.6710077519379845, 'recall': 0.724715338245144, 'f1-score': 0.6968282080180326, 'support': 2986}, 'micro avg': {'precision': 0.6710077519379845, 'recall': 0.724715338245144, 'f1-score': 0.6968282080180326, 'support': 2986}, 'macro avg': {'precision': 0.6710077519379845, 'recall': 0.724715338245144, 'f1-score': 0.6968282080180326, 'support': 2986}, 'weighted avg': {'precision': 0.6710077519379845, 'recall': 0.724715338245144, 'f1-score': 0.6968282080180326, 'support': 2986}}


PredictionOutput(predictions=array([[[ 0.67511696, -0.5009711 , -0.46437895],
        [ 6.5152907 , -3.7528322 , -2.764819  ],
        [ 6.2750835 , -3.4611936 , -2.6092248 ],
        ...,
        [ 1.0468543 , -1.1551912 , -0.7291689 ],
        [ 1.0468543 , -1.1551912 , -0.7291689 ],
        [ 1.0468543 , -1.1551912 , -0.7291689 ]],

       [[ 1.1194434 , -0.5603515 , -0.7848928 ],
        [ 6.4021363 , -3.790402  , -2.763958  ],
        [ 1.4067718 ,  0.58311486, -3.1844864 ],
        ...,
        [ 1.0042442 , -0.71520054, -0.6960957 ],
        [ 1.0042442 , -0.71520054, -0.6960957 ],
        [ 1.0042442 , -0.71520054, -0.6960957 ]],

       [[ 2.6409874 , -0.8373926 , -1.4373262 ],
        [ 6.4441977 , -3.41468   , -2.8110123 ],
        [ 6.621584  , -3.7694957 , -3.145807  ],
        ...,
        [ 6.8595076 , -3.8272142 , -2.8958535 ],
        [ 6.692259  , -3.8089647 , -2.7114413 ],
        [ 3.0567408 , -1.0118529 , -1.6197305 ]],

       ...,

       [[ 0.5953423 , -0.374631