In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForTokenClassification, AutoTokenizer, get_scheduler, pipeline, DataCollatorForTokenClassification
from datasets import load_dataset
from accelerate import Accelerator

import evaluate
metric = evaluate.load("seqeval")

from tqdm.auto import tqdm

#### Тут не буду особо комментировать, весь код - копия других 3 тетрадок

In [2]:
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [3]:
train_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv'
test_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/test_data.csv'
data_files = {'train': train_path, 'test': test_path}

In [4]:
local_dataset = load_dataset('csv', data_files = data_files)

Found cached dataset csv (/home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
local_dataset = local_dataset.map(lambda x: {'augmented_ner_tags': [literal_eval(_) for _ in x['augmented_ner_tags']]}, batched=True, num_proc=16)
local_dataset = local_dataset.map(lambda x: {'augmented_tokens': [literal_eval(_) for _ in x['augmented_tokens']]}, batched=True, num_proc=16)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-2ef9f8d1ce7e0ca0_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d81918492de155fe_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1b8b956640a6ff2a_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-26420b5be10174f6_*_of_00016.arrow


In [6]:
local_dataset = local_dataset.remove_columns('sentence')

In [7]:
local_dataset = local_dataset.rename_columns({'augmented_tokens': 'tokens',
                                             'augmented_ner_tags': 'ner_tags'})

In [8]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [9]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [11]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [15]:
def evaluation(model, dataloader):
    progress_bar = tqdm(range(len(dataloader)))
    metric = evaluate.load('seqeval')
    accelerator = Accelerator()
    model, dataloader = accelerator.prepare(model, dataloader)
    
    model.eval()
    for batch in dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        
        preds = outputs.logits.argmax(dim=-1)
        labels = batch['labels']
        
        preds = accelerator.pad_across_processes(preds, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        
        preds_gathered = accelerator.gather(preds)
        labels_gathered = accelerator.gather(labels)
        
        true_preds, true_labels = postprocess(preds_gathered, labels_gathered)
        metric.add_batch(predictions=true_preds, references=true_labels)
        progress_bar.update(1)
        
    results = metric.compute()
    return results

Babelscape
___

In [13]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
babelscape_dataset_tokenized = local_dataset.map(tokenize_and_align_labels, batched=True, num_proc=16)
babelscape_dataset_tokenized = babelscape_dataset_tokenized.remove_columns(local_dataset['train'].column_names)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

babelscape_test_dataloader = DataLoader(
    babelscape_dataset_tokenized["test"],
    collate_fn=data_collator,
    batch_size=16,
    num_workers=16
)

model_checkpoint = 'babelscape-bert-finetuned-ner-accelerate/'
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-945d44bd5536c794_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-6a2143c5df79d6ca_*_of_00016.arrow


In [16]:
babelscape_result = evaluation(model, babelscape_test_dataloader)

  0%|          | 0/4286 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [19]:
babelscape_result

{'LOC': {'precision': 0.9186735828084476,
  'recall': 0.9041256446319738,
  'f1': 0.9113415594644263,
  'number': 38394},
 'MISC': {'precision': 0.7029363084656409,
  'recall': 0.6917833011583011,
  'f1': 0.6973152117729332,
  'number': 16576},
 'ORG': {'precision': 0.7851195477809182,
  'recall': 0.7886135195273536,
  'f1': 0.7868626550298576,
  'number': 13033},
 'PER': {'precision': 0.9071917808219178,
  'recall': 0.8949702267832257,
  'f1': 0.9010395629158784,
  'number': 23679},
 'overall_precision': 0.8575372722252899,
 'overall_recall': 0.8469492375820772,
 'overall_f1': 0.8522103691997014,
 'overall_accuracy': 0.9676213772160698}

RoBERTa
___

In [25]:
tokenizer = AutoTokenizer.from_pretrained("XLM_RoBERTa_finetuned_with_accelerate")
roberta_dataset_tokenized = local_dataset.map(tokenize_and_align_labels, batched=True, num_proc=16)
roberta_dataset_tokenized = roberta_dataset_tokenized.remove_columns(local_dataset['train'].column_names)


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
roberta_test_dataloader = DataLoader(
    roberta_dataset_tokenized["test"],
    collate_fn=data_collator,
    batch_size=64,
    num_workers=16
)

model_checkpoint = 'XLM_RoBERTa_finetuned_with_accelerate/'
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e913e3bd01f366b7_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-53332ecab9594ee0_*_of_00016.arrow


In [27]:
roberta_result = evaluation(model, roberta_test_dataloader)

  0%|          | 0/1072 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

In [28]:
roberta_result

{'LOC': {'precision': 0.8026782406182184,
  'recall': 0.775187854623524,
  'f1': 0.78869357187435,
  'number': 39126},
 'MISC': {'precision': 0.44436952124072826,
  'recall': 0.5462697814619443,
  'f1': 0.4900787614508333,
  'number': 13270},
 'ORG': {'precision': 0.6095791001451378,
  'recall': 0.6071211199026172,
  'f1': 0.6083476272155517,
  'number': 13144},
 'PER': {'precision': 0.6994006849315069,
  'recall': 0.6698372350457136,
  'f1': 0.6842998052396808,
  'number': 24391},
 'overall_precision': 0.6835670900055218,
 'overall_recall': 0.6882721197362422,
 'overall_f1': 0.6859115363944128,
 'overall_accuracy': 0.939897871961087}

DeepPavlov
___

In [31]:

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov_RuBERT_finetuned_with_accelerate/')
rubert_dataset_tokenized = local_dataset.map(tokenize_and_align_labels, batched=True, num_proc=16)
rubert_dataset_tokenized = rubert_dataset_tokenized.remove_columns(local_dataset['train'].column_names)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

rubert_test_dataloader = DataLoader(
    rubert_dataset_tokenized["test"],
    collate_fn=data_collator,
    batch_size=16,
    num_workers=16
)

model_checkpoint = 'DeepPavlov_RuBERT_finetuned_with_accelerate/'
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5f0098d42d615fc8_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a71f4d5a68b55cfc_*_of_00016.arrow


In [33]:
rubert_result = evaluation(model, rubert_test_dataloader)

  0%|          | 0/4286 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [34]:
rubert_result

{'LOC': {'precision': 0.9240459429418303,
  'recall': 0.9107650572553929,
  'f1': 0.9173574346780867,
  'number': 38337},
 'MISC': {'precision': 0.7302151658186722,
  'recall': 0.7065662257547898,
  'f1': 0.7181960689738333,
  'number': 16859},
 'ORG': {'precision': 0.7987166755786418,
  'recall': 0.7994494991971863,
  'f1': 0.7990829193733283,
  'number': 13079},
 'PER': {'precision': 0.9116438356164384,
  'recall': 0.9063670411985019,
  'f1': 0.908997780433669,
  'number': 23496},
 'overall_precision': 0.8678078409718387,
 'overall_recall': 0.8562617820444367,
 'overall_f1': 0.861996149648148,
 'overall_accuracy': 0.9671571640151935}

In [79]:
result_df = pd.DataFrame([rubert_result, roberta_result, babelscape_result], index=['rubert', 'roberta', 'babelscape'])

In [81]:
overall_result_df = result_df[['overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']]

In [82]:
overall_result_df

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy
rubert,0.867808,0.856262,0.861996,0.967157
roberta,0.683567,0.688272,0.685912,0.939898
babelscape,0.857537,0.846949,0.85221,0.967621


#### Выводы и дальнейшие шаги
- babelscape я изначально обучал с дефолтным токенизатором, поскольку он адекватно разбивал тексты (не бил каждое слово по буквам), хочу попробовать обучить новый токенизатор, и с ним уже дообучить модель
- на первый взгляд роберта хуже других, но, ее обучение можно смело продолжить, поскольку 3 эпох было явно мало, качество продолжало стабильно быстро расти (начиналось совсем с низкого)
- в целом f1 в 0.86 на таких дико грязных данных это неплохо. Я думаю его можно подтянуть к 90, если добавить эпох / дообучить токенизатор у babelscape, так как качество на 3 эпохе продолжало расти.