In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
import pandas as pd
import numpy as np
from ast import literal_eval
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForTokenClassification, AutoTokenizer, get_scheduler, DataCollatorForTokenClassification
from datasets import load_dataset
from accelerate import Accelerator

import evaluate
metric = evaluate.load("seqeval")

from tqdm.auto import tqdm

In [4]:
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
train_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv'
test_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/test_data.csv'
data_files = {'train': train_path, 'test': test_path}

In [6]:
local_dataset = load_dataset('csv', data_files = data_files)

Found cached dataset csv (/home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
local_dataset = local_dataset.map(lambda x: {'augmented_ner_tags': [literal_eval(_) for _ in x['augmented_ner_tags']]}, batched=True, num_proc=16)
local_dataset = local_dataset.map(lambda x: {'augmented_tokens': [literal_eval(_) for _ in x['augmented_tokens']]}, batched=True, num_proc=16)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-2ef9f8d1ce7e0ca0_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d81918492de155fe_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1b8b956640a6ff2a_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-26420b5be10174f6_*_of_00016.arrow


In [8]:
local_dataset = local_dataset.rename_columns({'augmented_tokens': 'tokens',
                                             'augmented_ner_tags': 'ner_tags',
                                             'sentence': 'sentence'})

In [8]:
# tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base")

In [10]:
test_sent = local_dataset['train'][10]['tokens']
test_sent

['превращение',
 'урана',
 'облучённого',
 'медленными',
 'нейтронами',
 'в',
 'изотоп',
 'бария',
 'открыли',
 'отто',
 'ган',
 'и',
 'фриц',
 'штрассман',
 'а',
 'теоретическое',
 'объяснение',
 'открытия',
 'сформулировали',
 'лиза',
 'мейтнер',
 'и',
 'отто',
 'фриш']

In [10]:
inputs = tokenizer(test_sent, is_split_into_words=True)
inputs.tokens()

['<s>',
 '▁прев',
 'ращение',
 '▁уран',
 'а',
 '▁обл',
 'уч',
 'ён',
 'ного',
 '▁мед',
 'лен',
 'ными',
 '▁ней',
 'тро',
 'на',
 'ми',
 '▁в',
 '▁из',
 'о',
 'топ',
 '▁бар',
 'ия',
 '▁открыл',
 'и',
 '▁от',
 'то',
 '▁',
 'ган',
 '▁и',
 '▁фр',
 'иц',
 '▁шт',
 'рас',
 'сман',
 '▁а',
 '▁теоретическ',
 'ое',
 '▁объяс',
 'нение',
 '▁открытия',
 '▁с',
 'форм',
 'ул',
 'ировали',
 '▁',
 'лиза',
 '▁ме',
 'йт',
 'нер',
 '▁и',
 '▁от',
 'то',
 '▁фр',
 'иш',
 '</s>']

In [42]:
def get_training_corpus():
    return (
        local_dataset["train"][i : i + 1000]["sentence"]
        for i in range(0, len(local_dataset["train"]), 1000)
    )

training_corpus = get_training_corpus()

In [38]:
new_tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=128000)





In [39]:
tokenizer.vocab_size

250002

In [40]:
len(new_tokenizer.vocab)

128000

In [41]:
inputs = new_tokenizer(test_sent, is_split_into_words=True)
inputs.tokens()

['<s>',
 '▁превращение',
 '▁урана',
 '▁облуч',
 'ён',
 'ного',
 '▁медленным',
 'и',
 '▁нейтрон',
 'ами',
 '▁в',
 '▁изотоп',
 '▁бари',
 'я',
 '▁открыли',
 '▁отто',
 '▁ган',
 '▁и',
 '▁фриц',
 '▁штрасс',
 'ман',
 '▁',
 'а',
 '▁теоретическо',
 'е',
 '▁объяснение',
 '▁открытия',
 '▁сформулировал',
 'и',
 '▁лиза',
 '▁мейтнер',
 '▁и',
 '▁отто',
 '▁фриш',
 '</s>']

In [43]:
# new_tokenizer.save_pretrained("XLM_RoBERTA_tokenier/")

('XLM_RoBERTA_tokenier/tokenizer_config.json',
 'XLM_RoBERTA_tokenier/special_tokens_map.json',
 'XLM_RoBERTA_tokenier/tokenizer.json')

In [9]:
tokenizer = AutoTokenizer.from_pretrained("XLM_RoBERTA_tokenier/")

In [10]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [12]:
tokenized_dataset = local_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=local_dataset["train"].column_names,
    num_proc=16
)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-8f742d2d366f693f_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-03e3e08134719f07_*_of_00016.arrow


In [13]:


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [17]:
model = AutoModelForTokenClassification.from_pretrained("XLM_RoBERTa_finetuned_with_accelerate/",
                                                        id2label=id2label, label2id=label2id,
                                                       num_labels = 9, ignore_mismatched_sizes=True)

In [18]:
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
    num_workers=8
)
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    collate_fn=data_collator,
    batch_size=8,
    num_workers=8
)

In [19]:
optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)

In [20]:
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [21]:
model_name = "XLM_RoBERTa_finetuned"
output_dir = "XLM_RoBERTa_finetuned_with_accelerate"

In [22]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [23]:
# training loop
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in test_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/25714 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

epoch 0: {'precision': 0.6835670900055218, 'recall': 0.6882721197362422, 'f1': 0.6859115363944128, 'accuracy': 0.939897871961087}
