In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForTokenClassification, AutoTokenizer, get_scheduler, DataCollatorForTokenClassification
from datasets import load_dataset
from accelerate import Accelerator

import evaluate
metric = evaluate.load("seqeval")

from tqdm.auto import tqdm

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
train_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv'
test_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/test_data.csv'
data_files = {'train': train_path, 'test': test_path}

In [10]:
local_dataset = load_dataset('csv', data_files = data_files)

Found cached dataset csv (/home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
local_dataset = local_dataset.map(lambda x: {'augmented_ner_tags': [literal_eval(_) for _ in x['augmented_ner_tags']]}, batched=True, num_proc=16)
local_dataset = local_dataset.map(lambda x: {'augmented_tokens': [literal_eval(_) for _ in x['augmented_tokens']]}, batched=True, num_proc=16)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-2ef9f8d1ce7e0ca0_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d81918492de155fe_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1b8b956640a6ff2a_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-26420b5be10174f6_*_of_00016.arrow


In [12]:
local_dataset = local_dataset.rename_columns({'augmented_tokens': 'tokens',
                                             'augmented_ner_tags': 'ner_tags',
                                             'sentence': 'sentence'})

In [9]:
# tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [None]:
# def get_training_corpus():
#     return (
#         local_dataset["train"][i : i + 1000]["sentence"]
#         for i in range(0, len(local_dataset["train"]), 1000)
#     )

# training_corpus = get_training_corpus()

In [None]:
# new_tokenizer = tokenizer.train_new_from_iterator(training_corpus, 52000)

In [None]:
# inputs = new_tokenizer(test_sent, is_split_into_words=True)
# inputs.tokens()

In [None]:
# new_tokenizer.save_pretrained("deeppavlov_tokenizer")

In [10]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [12]:
tokenized_dataset = local_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=local_dataset["train"].column_names,
    num_proc=16
)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0666774e54c87a50_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-71515305476e0e87_*_of_00016.arrow


In [19]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 205710
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 68571
    })
})

In [16]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [17]:
model = AutoModelForTokenClassification.from_pretrained("DeepPavlov/rubert-base-cased",
                                                        id2label=id2label, label2id=label2id,
                                                       num_labels = 9)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
    num_workers=16
)
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    collate_fn=data_collator,
    batch_size=4,
    num_workers=16
)

In [34]:
optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)

In [21]:
num_train_epochs = 4
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [22]:
model_name = "DeepPavlov_RuBERT_finetuned"
output_dir = "DeepPavlov_RuBERT_finetuned_with_accelerate"

In [23]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [24]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.eval()
    for batch in test_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/205712 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

epoch 0: {'precision': 0.8483710657095528, 'recall': 0.811218940409939, 'f1': 0.829379153238649, 'accuracy': 0.9609180578924567}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

epoch 1: {'precision': 0.8491551628934291, 'recall': 0.8493896713615023, 'f1': 0.8492724009388376, 'accuracy': 0.9646156791230491}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

epoch 2: {'precision': 0.8569519602429597, 'recall': 0.8605729241757145, 'f1': 0.8587586252690641, 'accuracy': 0.9669539795606514}


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

epoch 3: {'precision': 0.8678078409718387, 'recall': 0.8562617820444367, 'f1': 0.861996149648148, 'accuracy': 0.9671571640151935}


#### Performance Comparison:

In [26]:
# NOT FINETUNED MODEL:
model = AutoModelForTokenClassification.from_pretrained("DeepPavlov/rubert-base-cased",
                                                        id2label=id2label, label2id=label2id,
                                                       num_labels = 9)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

In [28]:
progress_bar = tqdm(range(len(test_dataloader)))
model.eval()
for batch in test_dataloader:
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Necessary to pad predictions and labels for being gathered
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)
    
    true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)
    progress_bar.update(1)

  0%|          | 0/17143 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [29]:
results = metric.compute()

In [30]:
results

{'LOC': {'precision': 0.005319430476896205,
  'recall': 0.002920747478857276,
  'f1': 0.0037709654421972908,
  'number': 68818},
 'MISC': {'precision': 0.034083246490529025,
  'recall': 0.0032761577033898807,
  'f1': 0.005977723304519847,
  'number': 169711},
 'ORG': {'precision': 0.023833167825223437,
  'recall': 0.0013109298778566296,
  'f1': 0.0024851646819865386,
  'number': 237999},
 'PER': {'precision': 0.014683219178082192,
  'recall': 0.0008109475555723682,
  'f1': 0.0015370069142905793,
  'number': 422962},
 'overall_precision': 0.015593594699061292,
 'overall_recall': 0.0015697784299992217,
 'overall_f1': 0.0028524100036362164,
 'overall_accuracy': 0.0452706719192367}

In [31]:
model_check = "DeepPavlov_RuBERT_finetuned_with_accelerate/"
model = AutoModelForTokenClassification.from_pretrained(model_check, id2label=id2label, label2id=label2id)

In [35]:
progress_bar = tqdm(range(len(test_dataloader)))
model.eval()
for batch in test_dataloader:
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Necessary to pad predictions and labels for being gathered
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)
    
    true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)
    progress_bar.update(1)

  0%|          | 0/17143 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [36]:
results = metric.compute()

In [37]:
results

{'LOC': {'precision': 0.9240459429418303,
  'recall': 0.9107650572553929,
  'f1': 0.9173574346780867,
  'number': 38337},
 'MISC': {'precision': 0.7302151658186722,
  'recall': 0.7065662257547898,
  'f1': 0.7181960689738333,
  'number': 16859},
 'ORG': {'precision': 0.7987166755786418,
  'recall': 0.7994494991971863,
  'f1': 0.7990829193733283,
  'number': 13079},
 'PER': {'precision': 0.9116438356164384,
  'recall': 0.9063670411985019,
  'f1': 0.908997780433669,
  'number': 23496},
 'overall_precision': 0.8678078409718387,
 'overall_recall': 0.8562617820444367,
 'overall_f1': 0.861996149648148,
 'overall_accuracy': 0.9671571640151935}

#### Тут бло 4 эпохи, на 3й качество отставало от babelscape