In [None]:
from datasets import load_dataset

# Fetch the dataset published under the id "conll2003" with `datasets.load_dataset`.
dataset = load_dataset("conll2003")

In [None]:
# Display the loaded dataset object as the cell output.
dataset

In [None]:
# Look at the first example of the 'train' split.
dataset['train'][0]

In [None]:
# Grab the feature description of the 'ner_tags' column and display it.
ner_features = dataset['train'].features['ner_tags']
ner_features

In [None]:
# Names attached to the tag feature; reused further down to size the
# classification head (`len(label_names)`).
label_names = ner_features.feature.names
label_names

In [None]:
# Number of label names.
print(len(label_names))

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that ships with the "xlm-roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
# Check that a fast tokenizer was loaded; per the transformers docs,
# `word_ids()` (used below) is only available on fast tokenizers.
tokenizer.is_fast

In [None]:
# Encode the already word-split first training sentence and show the
# tokens the tokenizer produced for it.
inputs = tokenizer(dataset['train'][0]['tokens'], is_split_into_words=True)
inputs.tokens()

In [None]:
# Encode the same sentence again, this time truncated/padded to a fixed
# length of 32 and returned as PyTorch tensors. Note that this rebinds
# `inputs`, which the following cells read.
inputs = tokenizer(
    dataset['train'][0]['tokens'],
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=32,
    return_tensors='pt',
)
inputs.tokens()

In [None]:
# For each token of the current encoding, the index of the word it came
# from (None for tokens that belong to no word).
inputs.word_ids()

In [None]:
# Show the full encoding returned by the tokenizer.
inputs

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level NER labels to one label per sub-word token.

    Args:
        labels: Sequence of integer label ids, one per word.
        word_ids: Sequence giving, for every token, the index of the word it
            came from, or ``None`` for tokens that belong to no word
            (special tokens, padding).

    Returns:
        A list with one entry per item of ``word_ids``:
        ``-100`` where the word id is ``None``, the word's own label for the
        first token of a word, and the matching "inside" label for every
        further token of the same word.

    Note:
        The continuation rule assumes label ids are laid out as
        ``O, B-X, I-X, B-Y, I-Y, ...`` (odd id = ``B-``, the next id = the
        matching ``I-``), as in the conll2003 tag set -- verify against
        ``label_names`` before reusing with another dataset.
    """
    new_labels = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token without a word: -100 is the default `ignore_index` of
            # PyTorch's cross-entropy, so these positions do not affect the loss.
            new_labels.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label as is.
            new_labels.append(labels[word_id])
        else:
            # Further token of the same word. An entity must begin only once,
            # so a "B-" label is turned into its "I-" counterpart here instead
            # of being repeated on every sub-word piece.
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
        previous_word = word_id
    return new_labels

In [None]:
# Compare the word-level tags of the first training sentence with the
# token-level labels produced for the current `inputs` encoding.
labels = dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels, align_labels_with_tokens(labels, word_ids), sep="\n")

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping that provides a "tokens" entry (one word list
            per sentence) and a parallel 'ner_tags' entry (one label list per
            sentence).

    Returns:
        The tokenizer's output for the batch (truncated to 256 tokens, not
        padded) with an added "labels" entry holding, per sentence, the
        result of ``align_labels_with_tokens``.
    """
    encodings = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=256,
        padding=False,
    )

    # word_ids(idx) gives the token -> word mapping of the idx-th sentence.
    encodings["labels"] = [
        align_labels_with_tokens(word_labels, encodings.word_ids(idx))
        for idx, word_labels in enumerate(examples['ner_tags'])
    ]
    return encodings

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator built around the same tokenizer; it is handed to the DataLoader
# below as `collate_fn`.
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Run the tokenize-and-align step over the dataset in batches, dropping the
# original columns, then switch the three splits to PyTorch output format.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
for split_name in ("train", "validation", "test"):
    tokenized_datasets[split_name].set_format("pt")

In [None]:
# Show the tokenized dataset object.
tokenized_datasets

In [None]:
# First tokenized example of the 'train' split.
tokenized_datasets['train'][0]

In [None]:
# First tokenized example of the 'validation' split.
tokenized_datasets['validation'][0]

In [None]:
# First tokenized example of the 'test' split.
tokenized_datasets['test'][0]

In [None]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 32 training examples, assembled by `collator`.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)

In [None]:
# Pull a single batch from the training DataLoader to experiment with below.
batch = next(iter(train_dataloader))

In [None]:
# Shape of the batched input ids.
batch['input_ids'].shape

In [None]:
# Show the whole batch.
batch

In [None]:
# For the first two training sentences, print the raw words, the tokens the
# tokenizer makes of them, and the stored labels / input ids side by side.
for i in range(2):
    words = dataset['train'][i]['tokens']
    encoded_example = tokenized_datasets["train"][i]
    print(words)
    print(tokenizer(words, is_split_into_words=True, truncation=True).tokens())
    print(encoded_example["labels"])
    print(encoded_example["input_ids"])

In [None]:
from transformers import AutoModel
# Load the bare "xlm-roberta-base" encoder, requesting it without a pooling layer.
encoder = AutoModel.from_pretrained("xlm-roberta-base", add_pooling_layer=False)

In [None]:
# Feed only the input ids and attention mask of the batch (no labels) through
# the encoder and print what it returns.
test = {key: batch[key] for key in ('input_ids', 'attention_mask')}
output = encoder(**test)
print(output)

In [None]:
# Shape of the encoder's last hidden state for the batch.
output.last_hidden_state.shape

In [None]:
import torch

# Linear classification head: one output score per label name. The input
# width is read from the encoder's config rather than hard-coded as 768, so
# the head still fits if a checkpoint with another hidden size is loaded.
dense = torch.nn.Linear(encoder.config.hidden_size, len(label_names))

In [None]:
# Project the encoder's hidden states to one score per label with the linear head.
res = dense(output.last_hidden_state)
res

In [None]:
# Shape of the projected scores.
res.shape

In [None]:
# Display the scores again (`res` has not been reassigned since it was computed).
res

In [None]:
# Labels of the batch, as produced by the collator.
batch['labels']

In [None]:
# Shape of the batch labels, for comparison with the shape of `res`.
batch['labels'].shape

In [None]:
import torch.nn.functional as F

# cross_entropy wants the class scores on dim 1, so the last axis of `res`
# (one score per label) is swapped into that position first.
loss = F.cross_entropy(res.transpose(1, 2), batch['labels'])
loss

In [None]:
# Split the batch into model inputs (`x`) and targets (`y`) and display both.
x = {key: batch[key] for key in ('input_ids', 'attention_mask')}
y = batch['labels']
x, y