In [1]:
import datasets, transformers

In [2]:
from datasets import load_dataset

data = load_dataset('xtreme', name='PAN-X.de')

In [3]:
data 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [4]:
from collections import defaultdict


In [5]:
"""
from collections import defaultdict
from datasets import DatasetDict

langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# Return a DatasetDict if a key doesn't exist
panx_ch = defaultdict(DatasetDict) # when defining the default dict, we must define the default value that it returns ... here it returns DatasetDict object
for lang, frac in zip(langs, fracs):
    # Load monolingual corpus
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle and downsample each split according to spoken proportion
    for split in ds:
        panx_ch[lang][split] = (
            ds[split]
            .shuffle(seed=0)
            .select(range(int(frac * ds[split].num_rows))))
            
""";


In [6]:
tags = data['train'].features['ner_tags'].feature

def create_tag_names(batch):
    """Translate integer NER class ids into their string tag names.

    Args:
        batch: Mapping whose ``'ner_tags'`` entry is an iterable of integer
            class ids.

    Returns:
        A dict with the single key ``"ner_tags_str"`` holding the tag names
        produced by ``tags.int2str`` for each id, in the same order.
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {"ner_tags_str": tag_names}

data = data.map(create_tag_names)

In [7]:
import pandas as pd
de_example = data['train'][0]
pd.DataFrame([de_example['tokens'], de_example['ner_tags_str']], index=['tokens','tags'])

Unnamed: 0,0,1,2,3,4,5,6,7
tokens,als,Teil,der,Savoyer,Voralpen,im,Osten,.
tags,O,O,O,B-LOC,I-LOC,O,O,O


In [8]:
# to check if we have imbalanced tokens

from collections import Counter, defaultdict 

# Per-split tally of tag types: an entity is counted once through its "B-" tag
# (its "I-" continuation tags are skipped), while every "O" token is counted.
split2freqs = defaultdict(Counter)

for split_name, split_rows in data.items():
    for sentence_tags in split_rows['ner_tags_str']:
        for tag in sentence_tags:
            prefix = tag[:1]
            if prefix == 'B':
                # 'B-LOC' -> 'LOC'
                split2freqs[split_name][tag.split('-')[-1]] += 1
            elif prefix == 'O':
                split2freqs[split_name]['O'] += 1
pd.DataFrame(split2freqs)

# the named-entities are balanced except for the O type.

Unnamed: 0,train,validation,test
O,137535,69057,68654
LOC,9778,4968,4961
PER,9290,4569,4750
ORG,8575,4281,4157


In [9]:
# tokenizing:

from transformers import AutoTokenizer

bert_model_name = "bert-base-cased"
xlmr_model_name = 'xlm-roberta-base'

bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [10]:
text = "My name is Omar"

bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()
bert_tokens, xlmr_tokens

(['[CLS]', 'My', 'name', 'is', 'Omar', '[SEP]'],
 ['<s>', '▁My', '▁name', '▁is', '▁Omar', '</s>'])

In [11]:
# building the model:

import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a dropout + linear head that scores every token.

    ``config_class`` is set to ``XLMRobertaConfig`` so the model is configured
    with XLM-R settings.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the token-classification head.

        Args:
            config: Model configuration; ``num_labels``, ``hidden_size`` and
                ``hidden_dropout_prob`` are read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder body; the pooling layer is left out because only the
        # per-token hidden states are used.
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Token-classification head: dropout followed by one linear projection
        # from the hidden size to the number of labels.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Load and initialize weights
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None,
                token_type_ids=None, labels=None, **kwargs):
        """Run the encoder, classify every token and optionally compute the loss.

        Args:
            input_ids: Token ids handed to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Segment ids forwarded to the encoder.
            labels: Optional target class ids; when given, a cross-entropy loss
                over the flattened logits and labels is returned as well.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            ``TokenClassifierOutput`` carrying ``loss`` (``None`` without
            labels), ``logits`` and the encoder's ``hidden_states`` and
            ``attentions``.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # encoder_out[0] is the first encoder output; it is regularised with
        # dropout and projected to one score per label for each token.
        token_states = self.dropout(encoder_out[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten logits and labels so each token is one classification sample.
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )
    

In [12]:
XLMRobertaConfig()

XLMRobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.37.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [13]:
# Two-way lookup between integer class ids and tag names.
index2tag = dict(enumerate(tags.names))
tag2index = {tag_name: tag_id for tag_id, tag_name in index2tag.items()}

In [14]:
# load the configuration of the xlmr model to pass it to our custom model

from transformers import AutoConfig
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels= tags.num_classes,
                                         id2label = index2tag,
                                         label2id = tag2index)
xlmr_config

XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.37.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

In [15]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

custom_xlmr_model = (XLMRobertaForTokenClassification
                    .from_pretrained(xlmr_model_name,config = xlmr_config)
                    .to(device))


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**The `config` argument** (passed to `XLMRobertaForTokenClassification.from_pretrained`):

This argument is optional. By default, `.from_pretrained` automatically loads the configuration of the pre-trained checkpoint named by `xlmr_model_name`. Here, however, a custom configuration (`xlmr_config`) was created with `AutoConfig.from_pretrained` and is passed explicitly. This overrides some default parameters of the pre-trained model's configuration with custom values for `num_labels`, `id2label`, and `label2id`.

In [16]:
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')
xlmr_tokens = xlmr_tokenizer(text).tokens()

pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=['Tokens','Input IDs'])

Unnamed: 0,0,1,2,3,4,5
Tokens,<s>,▁My,▁name,▁is,▁Omar,</s>
Input IDs,0,2646,9351,83,112493,2


In [17]:
outputs = custom_xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
outputs.shape, predictions.shape

(torch.Size([1, 6, 7]), torch.Size([1, 6]))

In [18]:
preds = [tags.names[i] for i in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=['Tokens', 'Predicted'])

Unnamed: 0,0,1,2,3,4,5
Tokens,<s>,▁My,▁name,▁is,▁Omar,</s>
Predicted,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC


We need to mask the continuation subwords of each word (e.g. "row" in "sparrow") so that only the first subword of a word carries the label and the model predicts one tag per word rather than one per subword.

Special tokens are masked as well: `<s>` (start of sentence) and `</s>` (end of sentence) don't hold meaning by themselves and shouldn't be considered for entity recognition. Masking them keeps the model focused on the actual content words (e.g. "Savoyer" in the example below) when identifying named entities.

In [19]:
text = " ".join(de_example['tokens'])
text

'als Teil der Savoyer Voralpen im Osten .'

In [20]:
de_example['tokens'],\
xlmr_tokenizer(text).tokens()

# as we illustrated above, we need to mask the following:
# "<s>", "yer in savoyer" , "al, pen in Voralpen" ... etc


(['als', 'Teil', 'der', 'Savoyer', 'Voralpen', 'im', 'Osten', '.'],
 ['<s>',
  '▁als',
  '▁Teil',
  '▁der',
  '▁Savo',
  'yer',
  '▁Vor',
  'al',
  'pen',
  '▁im',
  '▁O',
  'sten',
  '▁',
  '.',
  '</s>'])

In [21]:
tokenized_inputs = xlmr_tokenizer(de_example['tokens'], truncation=True,
               is_split_into_words=True)
tokenized_inputs.word_ids()

[None, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6, 6, 7, 7, None]

##### Why use is_split_into_words=True? (default is False)

This argument tells the tokenizer that the input (`de_example['tokens']`) is already split into words, so the list is treated as one sequence of words rather than as a batch of separate strings. For example:

>* Each word in `de_example['tokens']` is still broken into subword pieces by the tokenizer (e.g. `Savoyer` becomes `▁Savo`, `yer`); `is_split_into_words=True` does not prevent further splitting, it only preserves the original word boundaries.
>* `word_ids()` then maps every subword back to the index of the word it came from (special tokens map to `None`), which is what lets us align the word-level NER tags with the subword tokens.

###### In the code above,
> `truncation=True` instructs the tokenizer (`xlmr_tokenizer`) to truncate the input if it exceeds the maximum sequence length the model accepts.

In [22]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and spread word-level tags over the subwords.

    Only the first subword of each word keeps the word's tag. Special tokens
    (word id ``None``) and continuation subwords receive ``-100``.

    Args:
        examples: Batch with ``"tokens"`` (one list of words per example) and
            ``"ner_tags"`` (one list of per-word tag ids per example).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per example.
    """
    encodings = xlmr_tokenizer(examples["tokens"], truncation=True,
                               is_split_into_words=True)
    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        word_ids = list(encodings.word_ids(batch_index=row))
        # Pair every word id with the one just before it (None for the first
        # position) so the start of a new word can be detected.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            word_tags[current] if current is not None and current != before else -100
            for before, current in zip(preceding_ids, word_ids)
        ])
    encodings["labels"] = aligned_labels
    return encodings

In [23]:
def encode_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batched mode.

    Args:
        corpus: Object exposing a ``map`` method that accepts ``batched`` and
            ``remove_columns`` keyword arguments.

    Returns:
        The result of ``corpus.map`` with the ``'langs'``, ``'ner_tags'`` and
        ``'tokens'`` columns removed.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=raw_columns)

In [24]:
data_encoded = encode_dataset(data) 

### `seqeval`: an important library for measuring sequence-labelling performance

> Note: `seqeval` expects its inputs to be lists of lists (one list of tags per sentence).

In [25]:
from seqeval.metrics import classification_report
# Gold tags and predicted tags, one inner list per sentence.
y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

# seqeval's signature is classification_report(y_true, y_pred): the gold tags
# go first. With the arguments swapped, precision and recall trade places and
# "support" is counted on the predictions instead of the gold entities.
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [26]:
# prepare the data to be list of lists in order to feed it to seqeval later:

import numpy as np

def align_predictions(predictions, label_ids):
    """Turn raw model outputs and label ids into per-example lists of tag names.

    Positions whose label id is ``-100`` are dropped from both the predictions
    and the labels.

    Args:
        predictions: Array whose axis 2 holds the per-class scores; the argmax
            over that axis is taken as the predicted class id.
        label_ids: 2-D array of label ids indexed as ``[example, position]``.

    Returns:
        ``(preds_list, labels_list)``: two lists with one inner list of tag
        names (looked up in ``index2tag``) per example.
    """
    pred_ids = np.argmax(predictions, axis=2)
    n_examples, n_positions = pred_ids.shape
    preds_list, labels_list = [], []

    for row in range(n_examples):
        # Columns that carry a real label for this example
        kept = [col for col in range(n_positions) if label_ids[row, col] != -100]
        preds_list.append([index2tag[pred_ids[row][col]] for col in kept])
        labels_list.append([index2tag[label_ids[row][col]] for col in kept])

    return preds_list, labels_list



In [27]:
# Training the model:


from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run
num_epochs = 2
batch_size = 16
# Number of full batches in one pass over the training split, used as the
# logging interval.
logging_steps = len(data["train"]) // batch_size
model_name = f"Custom-{xlmr_model_name}-finetuned-panx-de"

training_args = TrainingArguments(
    output_dir=model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # Very large save interval, presumably to avoid step-based checkpoints
    # during this short run.
    save_steps=1e6,
    weight_decay=0.01,
    logging_steps=logging_steps,
)

In [28]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the entity-level F1 score for one evaluation pass.

    Args:
        eval_pred: Object with ``predictions`` and ``label_ids`` attributes;
            both are handed to ``align_predictions``.

    Returns:
        A dict with the single key ``"f1"`` holding the seqeval F1 score.
    """
    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)
    # seqeval's signature is f1_score(y_true, y_pred): gold labels go first.
    return {"f1": f1_score(y_true, y_pred)}

In [30]:
# The data collator pads the sequences of a batch to a common length so they
# can be stacked into tensors.
# NOTE(review): for token classification the label sequences are presumably
# padded too (with -100, which the loss ignores) - confirm against the
# DataCollatorForTokenClassification documentation.

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [31]:
def model_init():
    """Build a fresh token-classification model and move it to ``device``.

    Returns:
        ``XLMRobertaForTokenClassification`` loaded from ``xlmr_model_name``
        with ``xlmr_config``, after ``.to(device)``.
    """
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config
    )
    return model.to(device)

from transformers import Trainer

# Wire the model factory, training arguments, collator, metric function and
# the encoded train/validation splits together, then start fine-tuning.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=data_encoded['train'],
    eval_dataset=data_encoded['validation'],
    tokenizer=xlmr_tokenizer,
)
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,F1
1,0.2233,0.145805,0.840516
2,0.1069,0.12432,0.86892


TrainOutput(global_step=2500, training_loss=0.16508057250976563, metrics={'train_runtime': 10986.9578, 'train_samples_per_second': 3.641, 'train_steps_per_second': 0.228, 'total_flos': 837894091031712.0, 'train_loss': 0.16508057250976563, 'epoch': 2.0})

In [48]:
data['train']

Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 20000
})

In [None]:
trainer.push_to_hub(commit_message="Training completed!")


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]