In [1]:
!pip install datasets
from datasets import get_dataset_config_names



In [2]:
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"Xtreme has {len(xtreme_subsets)} configurations")

Xtreme has 183 configurations


In [3]:
panx_subsets = [s for s in xtreme_subsets if s.startswith('PAN')]
panx_subsets[:3]
# suffix has ISO 639-1 language code

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [4]:
# The language code for German is "de", so the subset is PAN-X.de
from datasets import load_dataset
load_dataset("xtreme", name='PAN-X.de')

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [5]:
from collections import defaultdict
from datasets import DatasetDict

In [6]:
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]

In [7]:
# Return an empty DatasetDict if a key doesn't exist
panx_ch = defaultdict(DatasetDict)

In [8]:
panx_ch

defaultdict(datasets.dataset_dict.DatasetDict, {})

In [9]:
# Fill panx_ch with one down-sampled DatasetDict per language.
for lang, frac in zip(langs, fracs):
  # Load the PAN-X subset for this language
  ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
  for split in ds:
    # Shuffle with a fixed seed, then keep the leading `frac` share of rows
    # (the language's spoken proportion).
    shuffled = ds[split].shuffle(seed=0)
    n_keep = int(frac * ds[split].num_rows)
    panx_ch[lang][split] = shuffled.select(range(n_keep))

In [10]:
panx_ch

defaultdict(datasets.dataset_dict.DatasetDict,
            {'de': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 12580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
             }),
             'fr': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 4580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 2290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'la

In [11]:
import pandas as pd

In [12]:
pd.DataFrame({lang: [panx_ch[lang]["train"].num_rows] for lang in langs}, index=["Number of Training examples"])

Unnamed: 0,de,fr,it,en
Number of Training examples,12580,4580,1680,1180


In [13]:
element = panx_ch["de"]["train"][0]
element

{'tokens': ['2.000',
  'Einwohnern',
  'an',
  'der',
  'Danziger',
  'Bucht',
  'in',
  'der',
  'polnischen',
  'Woiwodschaft',
  'Pommern',
  '.'],
 'ner_tags': [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0],
 'langs': ['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de']}

In [14]:
for key, val in element.items():
  print(f"{key}: {val}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [15]:
panx_ch["de"]["train"].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None),
 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [16]:
# decoding the ner_tags
for key, val in panx_ch["de"]["train"].features.items():
  print(f"{key}: {val}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [17]:
tags = panx_ch["de"]["train"].features["ner_tags"].feature
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [18]:
def create_tag_names(batch):
  """Return a ``ner_tags_str`` entry with the string name of every tag id.

  Each id in ``batch["ner_tags"]`` is converted with ``tags.int2str``, where
  ``tags`` is the notebook-level label feature.
  """
  tag_names = list(map(tags.int2str, batch["ner_tags"]))
  return {"ner_tags_str": tag_names}

In [19]:
panx_de = panx_ch["de"].map(create_tag_names)

In [20]:
panx_de.items()

dict_items([('train', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 12580
})), ('validation', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 6290
})), ('test', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 6290
}))])

In [21]:
panx_de['train'].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None),
 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags_str': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [22]:
de_example = panx_de["train"][0]
pd.DataFrame([de_example["tokens"], de_example["ner_tags_str"]], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [23]:
from collections import Counter

In [24]:
# Count entities per split: every tag starting with "B" opens one entity.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
  begin_tags = (
      tag
      for row in dataset["ner_tags_str"]
      for tag in row
      if tag.startswith("B")
  )
  for tag in begin_tags:
    # "B-LOC" -> "LOC"
    split2freqs[split][tag.split("-")[1]] += 1

pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


In [25]:
!pip install transformers



In [26]:
from transformers import AutoTokenizer

In [27]:
bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"

bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [28]:
text = "Tonit Kumar loves New York!"
bert_tokens = bert_tokenizer(text)
xlmr_tokens = xlmr_tokenizer(text)

In [29]:
bert_tokens, xlmr_tokens

({'input_ids': [101, 17752, 1204, 9392, 7871, 1203, 1365, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 98765, 18, 18385, 5161, 7, 2356, 5753, 38, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [30]:
bert_tokens.tokens(), xlmr_tokens.tokens()

(['[CLS]', 'Toni', '##t', 'Kumar', 'loves', 'New', 'York', '!', '[SEP]'],
 ['<s>', '▁Toni', 't', '▁Kumar', '▁love', 's', '▁New', '▁York', '!', '</s>'])

In [31]:
pd.DataFrame([bert_tokens.tokens(), bert_tokens["input_ids"], xlmr_tokens.tokens(), xlmr_tokens["input_ids"]],
             index=["bert tokens", "bert ids", "xlmr tokens", "xlmr ids"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
bert tokens,[CLS],Toni,##t,Kumar,loves,New,York,!,[SEP],
bert ids,101,17752,1204,9392,7871,1203,1365,106,102,
xlmr tokens,<s>,▁Toni,t,▁Kumar,▁love,s,▁New,▁York,!,</s>
xlmr ids,0,98765,18,18385,5161,7,2356,5753,38,2


In [32]:
"".join(xlmr_tokens.tokens()).replace("\u2581", " ")

'<s> Tonit Kumar loves New York!</s>'

# Creating a custom model for Token Classification

In [33]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [34]:
print(XLMRobertaConfig)

<class 'transformers.models.xlm_roberta.configuration_xlm_roberta.XLMRobertaConfig'>


In [35]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  """RoBERTa encoder body with a dropout + linear head applied to every token.

  The head projects each position's hidden state to ``config.num_labels``
  logits. When ``labels`` are passed to ``forward`` a cross-entropy loss over
  the flattened logits/labels is returned as well.
  """
  config_class = XLMRobertaConfig

  def __init__(self, config):
    """Build the encoder body and the classification head from ``config``."""
    super().__init__(config)
    self.num_labels = config.num_labels

    # Encoder body, constructed without the pooling layer
    self.roberta = RobertaModel(config, add_pooling_layer=False)

    # Token classification head: dropout, then hidden_size -> num_labels
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Load and initialize weights
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
    """Run the encoder and the head; compute the loss if ``labels`` is given.

    Extra keyword arguments are forwarded to the encoder unchanged.

    Returns:
      A ``TokenClassifierOutput`` carrying ``loss`` (``None`` without labels),
      ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
    """
    encoder_out = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        **kwargs,
    )

    # First encoder output -> dropout -> per-token logits
    logits = self.classifier(self.dropout(encoder_out[0]))

    if labels is None:
      loss = None
    else:
      # Flatten all leading dimensions so each token is one sample
      criterion = nn.CrossEntropyLoss()
      loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=encoder_out.hidden_states,
        attentions=encoder_out.attentions,
    )

In [36]:
tags, tags.num_classes

(ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None),
 7)

In [37]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [38]:
from transformers import AutoConfig

In [39]:
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels=tags.num_classes,
                                         id2label=index2tag, label2id=tag2index)

In [40]:
import torch

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
input_ids

tensor([[    0, 98765,    18, 18385,  5161,     7,  2356,  5753,    38,     2]])

In [44]:
xlmr_tokens.tokens()

['<s>', '▁Toni', 't', '▁Kumar', '▁love', 's', '▁New', '▁York', '!', '</s>']

In [45]:
pd.DataFrame([xlmr_tokens.tokens(), input_ids[0].numpy()], index=["tokens", "input_ids"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tokens,<s>,▁Toni,t,▁Kumar,▁love,s,▁New,▁York,!,</s>
input_ids,0,98765,18,18385,5161,7,2356,5753,38,2


In [46]:
outputs = xlmr_model(input_ids.to(device))
outputs, outputs[0]

(TokenClassifierOutput(loss=None, logits=tensor([[[ 0.2916,  0.2158,  0.2807,  0.7204, -0.2895,  0.4300,  0.7322],
          [ 0.1945,  0.2635,  0.3099,  0.7640, -0.5781,  0.1128,  0.9446],
          [ 0.2377,  0.4244,  0.1622,  0.7212, -0.5094,  0.1688,  0.8788],
          [ 0.2692,  0.4638,  0.2209,  0.6643, -0.4945,  0.1523,  0.9933],
          [ 0.1811,  0.3856,  0.1555,  0.7328, -0.4270,  0.0585,  0.9718],
          [ 0.0753,  0.3269,  0.1844,  0.6361, -0.3319,  0.1697,  0.8784],
          [ 0.1248,  0.4812,  0.1662,  0.7421, -0.5157,  0.1397,  1.0478],
          [ 0.2697,  0.4948,  0.2785,  0.7395, -0.5774,  0.1537,  1.0188],
          [ 0.2632,  0.3823,  0.1873,  0.6287, -0.4755,  0.1964,  0.9319],
          [ 0.2870,  0.2219,  0.3261,  0.7777, -0.3044,  0.4261,  0.7143]]],
        device='cuda:0', grad_fn=<ViewBackward0>), hidden_states=None, attentions=None),
 tensor([[[ 0.2916,  0.2158,  0.2807,  0.7204, -0.2895,  0.4300,  0.7322],
          [ 0.1945,  0.2635,  0.3099,  0.764

In [47]:
outputs = outputs.logits
predictions = torch.argmax(outputs, dim=-1)
print(predictions)
print(f"Number of tokens in sequence: {len(xlmr_tokens.tokens())}")
print(f"Shape of outputs: {outputs.size()}")

tensor([[6, 6, 6, 6, 6, 6, 6, 6, 6, 3]], device='cuda:0')
Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])


In [48]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens.tokens(), preds], index=["tokens", "predictions"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tokens,<s>,▁Toni,t,▁Kumar,▁love,s,▁New,▁York,!,</s>
predictions,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,I-LOC,B-ORG


In [91]:
def tag_text(text, tags, model, tokenizer):
  """Tag every token of ``text`` with the model's most likely label.

  Args:
    text: Raw input string.
    tags: Object whose ``names`` sequence maps a class index to a tag name.
    model: Token-classification module; it is called with the input-id tensor
      and its first output is read as the logits.
    tokenizer: Tokenizer used for both the displayed tokens and the input ids.

  Returns:
    A two-row DataFrame ("Tokens", "Tags") with one column per token.
  """
  # Encode once with the tokenizer that was passed in, so the displayed tokens
  # and the ids fed to the model always come from the same encoding (the
  # previous version silently used the global xlmr_tokenizer for the ids).
  encoding = tokenizer(text, return_tensors="pt")
  tokens = encoding.tokens()

  # Run on whichever device the model's weights live on instead of relying on
  # a notebook-level `device` variable.
  model_device = next(model.parameters()).device
  input_ids = encoding.input_ids.to(model_device)

  # Inference only: no need to build an autograd graph
  with torch.no_grad():
    logits = model(input_ids)[0]

  # Most likely class per token, mapped back to tag names
  predictions = torch.argmax(logits, dim=-1)
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]

  return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

# Tokenizing text for NER

In [50]:
words, labels = de_example["tokens"], de_example["ner_tags"]
words, labels

(['2.000',
  'Einwohnern',
  'an',
  'der',
  'Danziger',
  'Bucht',
  'in',
  'der',
  'polnischen',
  'Woiwodschaft',
  'Pommern',
  '.'],
 [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0])

In [51]:
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [52]:
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [53]:
# Handling subwords
# Handling subwords: only the first sub-token of a word keeps the word's
# label; special tokens (word id None) and continuation pieces get -100.
label_ids = []
previous_word_idx = None

for word_idx in word_ids:
  starts_new_word = word_idx is not None and word_idx != previous_word_idx
  label_ids.append(labels[word_idx] if starts_new_word else -100)
  previous_word_idx = word_idx

In [54]:
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word_ids", "Label_ids", "Labels"]

In [55]:
pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word_ids,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label_ids,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [56]:
def tokenize_and_align_labels(examples):
  """Tokenize pre-split words and attach one label id per produced token.

  For every example, the first token of each word receives that word's entry
  from ``examples["ner_tags"]``; tokens with no word id and tokens that
  continue the previous word receive -100.

  Returns:
    The tokenizer output with an added ``"labels"`` entry.
  """
  tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

  aligned = []
  for row, word_labels in enumerate(examples["ner_tags"]):
    row_labels = []
    last_word = None
    for word in tokenized_inputs.word_ids(batch_index=row):
      is_first_piece = word is not None and word != last_word
      row_labels.append(word_labels[word] if is_first_piece else -100)
      last_word = word
    aligned.append(row_labels)

  tokenized_inputs["labels"] = aligned
  return tokenized_inputs

In [57]:
def encode_panx_dataset(corpus):
  """Map ``tokenize_and_align_labels`` over ``corpus`` in batches.

  The ``langs``, ``ner_tags`` and ``tokens`` columns are removed from the
  result.
  """
  raw_columns = ["langs", "ner_tags", "tokens"]
  return corpus.map(
      tokenize_and_align_labels,
      batched=True,
      remove_columns=raw_columns,
  )

In [58]:
panx_de_encoded = encode_panx_dataset(panx_ch["de"])

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

In [59]:
panx_de_encoded

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6290
    })
})

# Performance Measure

In [60]:
!pip install seqeval



In [61]:
# making our data into seqeval accepted format -> [[], [],...]
import numpy as np

In [80]:
def align_predictions(predictions, label_ids):
  """Turn score arrays and label ids into per-example lists of tag names.

  Positions whose label id is -100 are left out of both lists.

  Args:
    predictions: Score array; the predicted class is the argmax over axis 2.
    label_ids: Label ids indexed as ``[example, position]``.

  Returns:
    ``(preds_list, labels_list)``: two lists with one inner list of tag names
    per example, looked up through ``index2tag``.
  """
  preds = np.argmax(predictions, axis=2)
  num_examples, num_positions = preds.shape
  preds_list, labels_list = [], []

  for row in range(num_examples):
    # Positions that carry a real label (everything except -100)
    kept = [col for col in range(num_positions) if label_ids[row, col] != -100]
    labels_list.append([index2tag[label_ids[row, col]] for col in kept])
    preds_list.append([index2tag[preds[row, col]] for col in kept])

  return preds_list, labels_list


# Fine Tune XLM-RoBERTa

In [63]:
!pip install transformers[torch]



In [64]:
from transformers import TrainingArguments

In [65]:
# Training hyperparameters
num_epochs = 3
batch_size = 24
# Number of full batches in the training split
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"

training_args = TrainingArguments(
    output_dir=model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [66]:
from huggingface_hub import notebook_login

In [67]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [68]:
from seqeval.metrics import f1_score

In [76]:
def compute_metrics(eval_pred):
  """Return ``{"f1": ...}`` for an evaluation result.

  ``eval_pred.predictions`` and ``eval_pred.label_ids`` are converted to
  tag-name lists with ``align_predictions`` and scored with ``f1_score``.
  """
  aligned = align_predictions(eval_pred.predictions, eval_pred.label_ids)
  predicted_tags, gold_tags = aligned
  score = f1_score(gold_tags, predicted_tags)
  return {"f1": score}

In [70]:
from transformers import DataCollatorForTokenClassification

In [71]:
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [72]:
def model_init():
  """Load a new XLMRobertaForTokenClassification and move it to ``device``.

  The model is loaded from ``xlmr_model_name`` with ``xlmr_config``.
  """
  model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config)
  return model.to(device)

In [73]:
from transformers import Trainer

In [82]:
trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator, compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded["train"],
                  eval_dataset=panx_de_encoded["validation"],
                  tokenizer=xlmr_tokenizer)

In [83]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2572,0.153766,0.818694
2,0.1233,0.147516,0.849195
3,0.0796,0.140897,0.860649


TrainOutput(global_step=1575, training_loss=0.15312925922492193, metrics={'train_runtime': 166.9858, 'train_samples_per_second': 226.007, 'train_steps_per_second': 9.432, 'total_flos': 864249509940432.0, 'train_loss': 0.15312925922492193, 'epoch': 3.0})

In [None]:
trainer.push_to_hub(commit_message="Training completed!")

In [99]:
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
text = "Ronny is going to Patiala to study at Thapar University"

In [101]:
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,<s>,▁Jeff,▁De,an,▁ist,▁ein,▁Informati,ker,▁bei,▁Google,▁in,▁Kaliforni,en,</s>
Tags,O,B-PER,I-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC,I-LOC,O


In [100]:
tag_text(text, tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Tokens,<s>,▁Ron,ny,▁is,▁going,▁to,▁Pati,ala,▁to,▁study,▁at,▁Tha,par,▁University,</s>
Tags,O,B-PER,I-PER,O,O,O,B-LOC,I-LOC,O,O,O,B-ORG,I-ORG,I-ORG,O
