In [None]:
!pip install -q datasets

In [None]:
from datasets import load_dataset

In [None]:
# Each pair is (language code, fraction of that language's splits to keep).
# The pairs are unzipped into two position-aligned lists, `langs` and `fracs`.
langs, fracs = map(
    list,
    zip(
        ("de", 0.629),
        ("fr", 0.229),
        ("it", 0.084),
        ("en", 0.059),
    ),
)

In [None]:
from collections import defaultdict

In [None]:
from datasets import DatasetDict

# Per-language container: the first access to panx_ch[lang] creates an empty
# DatasetDict that the loading loop then fills split by split.
panx_ch = defaultdict(DatasetDict)

In [None]:
# For every language, load its "PAN-X.<lang>" configuration of "xtreme" and keep
# a subset of each split: shuffle with a fixed seed, then take the first
# int(frac * num_rows) rows.
for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        shuffled = ds[split].shuffle(seed=0)
        panx_ch[lang][split] = shuffled.select(
            range(int(frac * ds[split].num_rows))
        )

In [None]:
import pandas as pd

In [None]:
# One-row table: number of training examples kept for each language.
pd.DataFrame(
    [[panx_ch[lang]["train"].num_rows for lang in langs]],
    columns=langs,
    index=["number of training examples"],
)

In [None]:
# Take the first example of the "de" training split and print each of its
# fields as "<key>: <value>".
element = panx_ch["de"]["train"][0]

for key, value in element.items():
    print(f"{key}: {value}")

In [None]:
# Print the feature definition of every column in the "de" training split.
for key, value in panx_ch["de"]["train"].features.items():
    print(f"{key}: {value}")

In [None]:
# Inner feature of the "ner_tags" column. Later cells rely on its int2str(),
# `names` and `num_classes` to translate between class indices and tag names.
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

In [None]:
def create_tag_names(batch):
    """Return a new "ner_tags_str" column holding the tag name of each tag id.

    Every entry of ``batch["ner_tags"]`` is converted with the module-level
    ``tags.int2str``.

    NOTE(review): despite the parameter name, the visible call site uses
    ``.map(create_tag_names)`` without ``batched=True``, so this presumably
    receives one example at a time -- confirm against the `datasets` defaults.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

In [None]:
# Apply create_tag_names to the "de" splits via map() with default arguments;
# the result carries the extra "ner_tags_str" column that later cells read.
panx_de = panx_ch["de"].map(create_tag_names)

In [None]:
# Show the tokens of the first "de" training example above their tag names.
de_example = panx_de["train"][0]

pd.DataFrame(
    [de_example["tokens"], de_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)

In [None]:
from collections import Counter

In [None]:
# split name -> Counter; a Counter is created on first access to a split.
split2freqs = defaultdict(Counter)

In [None]:
# Count, per split, how many tags start with "B", grouped by the part after the
# dash (presumably the begin-of-entity marker of an IOB-style scheme, so this
# counts entities per type -- confirm against the tag set).
#
# Reset first: without this, re-running the cell would add to the counts left
# over from the previous run and silently double every frequency.
split2freqs.clear()
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1

In [None]:
# One row per split, one column per counted tag type.
pd.DataFrame.from_dict(split2freqs, orient = "index")

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load two pretrained tokenizers so their output can be compared on the same text.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
# Sample sentence reused by the tokenizer and model cells below.
text = "Tim Sparrow lives San Diego!"

In [None]:
# Tokenize the sample sentence with each tokenizer and keep the token strings.
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [None]:
# Side-by-side comparison: one row of tokens per tokenizer.
pd.DataFrame([bert_tokens, xlmr_tokens], index = ["BERT","XLM-R"])

In [None]:
from transformers import XLMRobertaForTokenClassification

In [None]:
# Display the `tags` feature that the label mappings below are built from.
tags

In [None]:
# Checkpoint name plus the two label mappings (index -> tag and tag -> index)
# derived from tags.names; num_labels is taken from tags.num_classes.
xlmr_model_name = "xlm-roberta-base"
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}
num_labels = tags.num_classes

In [None]:
# Display the index -> tag mapping.
index2tag

In [None]:
# Display the number of tag classes.
num_labels

In [None]:
import torch

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the checkpoint with a token-classification head sized and labelled by
# the tag mappings built above, then move the model to `device`.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    num_labels=num_labels,
    id2label=index2tag,
    label2id=tag2index,
)
xlmr_model = xlmr_model.to(device)

In [None]:
# Display the sample sentence again before encoding it.
text

In [None]:
# Encode the sample sentence to input IDs, returned as a PyTorch tensor ("pt").
input_ids = xlmr_tokenizer.encode(text, return_tensors = "pt")

In [None]:
# Show the XLM-R tokens above their input IDs. `input_ids` has not been moved
# to `device` at this point, so .numpy() is called on it directly.
pd.DataFrame(
    [xlmr_tokens, input_ids[0].numpy()],
    index = ["Tokens", "Input IDs"]
)

In [None]:
# Run the model on a copy of the input IDs placed on `device`; keep the logits.
outputs = xlmr_model(input_ids.to(device)).logits

In [None]:
# Inspect the shape of the logits tensor.
outputs.shape

In [None]:
# Index of the largest logit along the last dimension.
predictions = torch.argmax(outputs, dim=-1)

In [None]:
# Translate the predicted indices of the first sequence into tag names
# (moved to the CPU first so they can be converted to a NumPy array).
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]

In [None]:
# Show each XLM-R token above the tag predicted for it.
pd.DataFrame(
    [xlmr_tokens, preds], index = ["Tokens", "Tags"]
)

In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag `text` with `model` and return the tokens and predicted tags as a table.

    Args:
        text: Input string to tag.
        tags: Object whose `names` sequence maps a class index to its tag name.
        model: Token-classification model (a `torch.nn.Module`). Calling it on
            the input IDs must return an output whose first element holds the
            logits.
        tokenizer: Tokenizer matching `model`. It produces both the displayed
            tokens and the input IDs, so the two rows always line up.

    Returns:
        pandas.DataFrame with two rows, "Tokens" and "Tags".
    """
    tokens = tokenizer(text).tokens()
    # Encode with the tokenizer that was passed in. Using a notebook-global
    # tokenizer here would silently mismatch tokens and IDs for any other one.
    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Send the inputs to wherever the model's weights live instead of relying
    # on a global `device` that may not match this particular model.
    input_ids = input_ids.to(next(model.parameters()).device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim=-1)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])