# Language Transfer and NER

## Load in data

In [None]:
from datasets import get_dataset_config_names
# Fetch the names of all configurations available under the 'xtreme' dataset
xtreme_subsets = get_dataset_config_names('xtreme')
print(f'XTREME has {len(xtreme_subsets)} configurations')

In [None]:
# Keep only the configurations whose name begins with 'PAN'
panx_subsets = [name for name in xtreme_subsets if name.startswith('PAN')]
# Display the first three matches
panx_subsets[:3]

## Load in the German corpus

In [None]:
from datasets import load_dataset
# Load the 'PAN-X.de' (German) configuration of XTREME; as the last expression
# of the cell, the returned object is displayed by the notebook
load_dataset('xtreme', name='PAN-X.de')

In [None]:
from collections import defaultdict
from datasets import DatasetDict

# Languages of the corpus and, in the same order, the fraction of each
# PAN-X subset to keep
langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]

# An empty DatasetDict is created for a language the first time it is accessed
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # The f-string selects the PAN-X configuration of the current language
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
    for split in ds:
        # Fixed seed keeps the shuffle (and hence the subsample) reproducible
        shuffled = ds[split].shuffle(seed=0)
        # Keep the first `frac` share of the shuffled rows, rounded down
        n_keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = shuffled.select(range(n_keep))

In [None]:
import pandas as pd

# One column per language, each holding the row count of its training split
train_sizes = {lang: [panx_ch[lang]['train'].num_rows] for lang in langs}
train_lang_df = pd.DataFrame(train_sizes, index=['Number of training examples'])

print(train_lang_df)

In [None]:
# Inspect the first example of the German training split, one field per line
element = panx_ch['de']['train'][0]
for key, value in element.items():
    print(f'{key}: {value}')

In [None]:
# Print every entry of the German training split's `features` mapping
for key, value in panx_ch['de']['train'].features.items():
    print(f'{key}:{value}')

In [None]:
# Inner feature of the 'ner_tags' column; later cells rely on its
# int2str(), names and num_classes members (presumably a ClassLabel — confirm)
tags = panx_ch['de']['train'].features['ner_tags'].feature
print(tags)

Use the `int2str()` method of the `tags` feature to convert the integer tag IDs back to their string names.

In [None]:
def create_tag_names(batch):
    """Return a 'ner_tags_str' column with the string form of each ID in `batch['ner_tags']`.

    Relies on the notebook-level `tags` feature for the int -> str conversion.
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {'ner_tags_str': tag_names}

# Run create_tag_names over the German DatasetDict to add the 'ner_tags_str' column
panx_de = panx_ch['de'].map(create_tag_names)
print(panx_de)

In [None]:
# Show the first German training example: tokens in the first row,
# their string tags in the second
de_example = panx_de['train'][0]
pd.DataFrame([de_example['tokens'], de_example['ner_tags_str']])

In [None]:
from collections import Counter

# Per split, count how often each entity type occurs. Only tags starting
# with "B" are counted, so each entity contributes once regardless of how
# many tokens it spans.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset['ner_tags_str']:
        for tag in row:
            if not tag.startswith("B"):
                continue
            # 'B-LOC' -> 'LOC'
            tag_type = tag.split('-')[1]
            split2freqs[split][tag_type] += 1

# One row per split, one column per entity type
pd.DataFrame.from_dict(split2freqs, orient='index')

## Multilingual Transformers

In [None]:
from transformers import AutoTokenizer
# Checkpoint names of the two models whose tokenizers are compared
bert_model_name = 'bert-base-cased'
xlmr_model_name = 'xlm-roberta-base'
# Load the tokenizer that belongs to each checkpoint
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [None]:
# Tokenize the same sentence with both tokenizers; `text` and `xlmr_tokens`
# are reused by later cells
text = 'Jack Sparrow loves New York!'
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

## Create a Custom Model for Token Classification

In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

In [None]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa body (configured via XLMRobertaConfig) with a dropout + linear head applied per token."""

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the model body and the token-classification head from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Model body, created without the pooling layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Classification head: dropout, then a linear projection onto the labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the body and head; also compute a cross-entropy loss when `labels` is given.

        Extra keyword arguments are forwarded to the RoBERTa body unchanged.
        Returns a TokenClassifierOutput carrying the loss (or None), the logits,
        and the body's hidden states and attentions.
        """
        body_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # First element of the body output feeds the head
        # (presumably the last hidden state — confirm)
        logits = self.classifier(self.dropout(body_out[0]))
        if labels is None:
            loss = None
        else:
            # Flatten so every token position is scored as one classification example
            flat_logits = logits.view(-1, self.num_labels)
            loss = nn.CrossEntropyLoss()(flat_logits, labels.view(-1))
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=body_out.hidden_states,
            attentions=body_out.attentions,
        )

## Loading custom model

In [None]:
# Two-way lookup between class indices and tag names, built from tags.names
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [None]:
from transformers import AutoConfig
# Load the checkpoint's config, passing the NER label count and the id <-> label mappings
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Instantiate the custom token-classification model from the pretrained
# checkpoint with the NER config, then move it to the chosen device
xlmr_model = (
    XLMRobertaForTokenClassification
    .from_pretrained(xlmr_model_name, config=xlmr_config)
    .to(device)
)

In [None]:
# Encode the example sentence as a PyTorch tensor of input IDs
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')
# Show the token strings next to the IDs of the first (only) sequence
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=['Tokens', 'Input IDs'])

In [None]:
# Forward pass on the model's device; keep only the logits
outputs = xlmr_model(input_ids.to(device)).logits
# Index of the highest-scoring class along the last dimension
predictions = torch.argmax(outputs, dim=-1)
print(f'Number of tokens in sequence: {len(xlmr_tokens)}')
print(f'Shape of outputs: {outputs.shape}')

In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag every token of `text` with the label that `model` predicts for it.

    Args:
        text: Raw input string.
        tags: Object whose `names` sequence maps a class index to its tag name.
        model: Token-classification model; it is called with a batch of input
            IDs and the first element of its output must be the logits.
        tokenizer: Tokenizer used both for the displayed tokens and for the
            model inputs; its encoding must expose `tokens()` and `input_ids`.

    Returns:
        A DataFrame with a 'Tokens' row (tokens including special tokens) and
        a 'Tags' row (predicted tag name per token).
    """
    # Encode once with the tokenizer that was passed in, so the displayed
    # tokens and the model inputs always come from the same encoding
    encoding = tokenizer(text, return_tensors='pt')
    tokens = encoding.tokens()
    # Run on whatever device the model lives on instead of a notebook global
    input_ids = encoding.input_ids.to(next(model.parameters()).device)
    # Inference only: no gradient tracking needed
    with torch.no_grad():
        outputs = model(input_ids)[0]
    # Highest-scoring class per token
    predictions = torch.argmax(outputs, dim=-1)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=['Tokens', 'Tags'])


In [None]:
# Unpack the words and integer tag IDs of the German example
words, labels = de_example['tokens'], de_example['ner_tags']

In [None]:
# The example is already a list of words, so the tokenizer is told not to split it again
tokenized_input = xlmr_tokenizer(de_example['tokens'], is_split_into_words=True)
# Turn the input IDs back into token strings for display
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
pd.DataFrame([tokens], index=['Tokens'])

In [None]:
# One entry per token; presumably the index of the word each token came from,
# with None for special tokens — confirm against the tokenizer docs
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=['Tokens', 'Word IDs'])

In [None]:
# prev_word_idx = None
# label_ids = []

# for word_idx in word_ids:
#     if word_idx is None or word_idx == prev_word_idx:
#         # Special tokens (None) and continuation subwords of a word get the ignore label -100
#         label_ids.append(-100)
#     elif word_idx != prev_word_idx:
#         label_ids.append(labels[word_idx])
#     prev_word_idx = word_idx

# labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
# print(labels)
# index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
# pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)