In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForTokenClassification, AutoTokenizer, get_scheduler, pipeline, DataCollatorForTokenClassification
from datasets import load_dataset
from accelerate import Accelerator

import evaluate
metric = evaluate.load("seqeval")

from tqdm.auto import tqdm
# from torch import cuda
# device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [7]:
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [4]:
train_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/train_data.csv'
test_path = '/home/sergey/Python_projects/RU_NER/Project_Data/Data/test_data.csv'
data_files = {'train': train_path, 'test': test_path}

In [5]:
local_dataset = load_dataset('csv', data_files = data_files)

Found cached dataset csv (/home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
local_dataset = local_dataset.map(lambda x: {'augmented_ner_tags': [literal_eval(_) for _ in x['augmented_ner_tags']]}, batched=True, num_proc=16)
local_dataset = local_dataset.map(lambda x: {'augmented_tokens': [literal_eval(_) for _ in x['augmented_tokens']]}, batched=True, num_proc=16)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-2ef9f8d1ce7e0ca0_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d81918492de155fe_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1b8b956640a6ff2a_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-26420b5be10174f6_*_of_00016.arrow


In [7]:
local_dataset

DatasetDict({
    train: Dataset({
        features: ['augmented_tokens', 'sentence', 'augmented_ner_tags'],
        num_rows: 205710
    })
    test: Dataset({
        features: ['augmented_tokens', 'sentence', 'augmented_ner_tags'],
        num_rows: 68571
    })
})

In [8]:
local_dataset = local_dataset.remove_columns('sentence')

In [9]:
local_dataset = local_dataset.rename_columns({'augmented_tokens': 'tokens',
                                             'augmented_ner_tags': 'ner_tags'})

In [10]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")

In [11]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [12]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [13]:
tokenized_dataset = local_dataset.map(tokenize_and_align_labels, batched=True, num_proc=16)

Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-945d44bd5536c794_*_of_00016.arrow
Loading cached processed dataset at /home/sergey/.cache/huggingface/datasets/csv/default-f5b328fb0b36accf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-6a2143c5df79d6ca_*_of_00016.arrow


In [14]:
tokenized_dataset = tokenized_dataset.remove_columns(local_dataset['train'].column_names)

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [8]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [9]:
model = AutoModelForTokenClassification.from_pretrained(
    "Babelscape/wikineural-multilingual-ner", id2label=id2label, label2id=label2id)

In [10]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [18]:
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
    num_workers=16
)
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    collate_fn=data_collator,
    batch_size=16,
    num_workers=16
)

In [62]:
optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)

In [None]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [21]:
model_name = "babelscap-bert-finetuned-ner-accelerate"
output_dir = "babelscape-bert-finetuned-ner-accelerate"

In [22]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
# training loop
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in test_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

#### Performance Comparison:

In [58]:
# NOT FINETUNED MODEL:
model_check = 'Saved_models/Babelscape_wikineural/'
model = AutoModelForTokenClassification.from_pretrained(model_check, id2label=id2label, label2id=label2id)

In [55]:
progress_bar = tqdm(range(len(test_dataloader)))
model.eval()
for batch in test_dataloader:
    with torch.no_grad():
#         input_ids
#         token_type_ids
#         attention_mask
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Necessary to pad predictions and labels for being gathered
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)
    
    true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)
    progress_bar.update(1)

  0%|          | 0/4286 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [56]:
results = metric.compute()

In [57]:
# NOT FINETUNED RESULTS:
results

{'LOC': {'precision': 0.16588154342878314,
  'recall': 0.3356718256305896,
  'f1': 0.22203723055668714,
  'number': 18673},
 'MISC': {'precision': 0.22448354073438362,
  'recall': 0.07766372582287072,
  'f1': 0.11540219018356575,
  'number': 47152},
 'ORG': {'precision': 0.19547780918188068,
  'recall': 0.3333767587285044,
  'f1': 0.24644869263735733,
  'number': 7676},
 'PER': {'precision': 0.056378424657534246,
  'recall': 0.1952846975088968,
  'f1': 0.08749667818230135,
  'number': 6744},
 'overall_precision': 0.15246824958586416,
 'overall_recall': 0.1720481026855256,
 'overall_f1': 0.16166749612108083,
 'overall_accuracy': 0.8288791050702202}

In [60]:
model_check = 'babelscape-bert-finetuned-ner-accelerate/'
model = AutoModelForTokenClassification.from_pretrained(model_check, id2label=id2label, label2id=label2id)

In [69]:
metric_finetuned = evaluate.load("seqeval")

In [72]:
progress_bar = tqdm(range(len(test_dataloader)))
model.eval()
for batch in test_dataloader:
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Necessary to pad predictions and labels for being gathered
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)
    
    true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
    metric_finetuned.add_batch(predictions=true_predictions, references=true_labels)
    progress_bar.update(1)

  0%|          | 0/4286 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

In [73]:
results = metric_finetuned.compute()

In [74]:
results

{'LOC': {'precision': 0.9186735828084476,
  'recall': 0.9041256446319738,
  'f1': 0.9113415594644263,
  'number': 38394},
 'MISC': {'precision': 0.7029363084656409,
  'recall': 0.6917833011583011,
  'f1': 0.6973152117729332,
  'number': 16576},
 'ORG': {'precision': 0.7851195477809182,
  'recall': 0.7886135195273536,
  'f1': 0.7868626550298576,
  'number': 13033},
 'PER': {'precision': 0.9071917808219178,
  'recall': 0.8949702267832257,
  'f1': 0.9010395629158784,
  'number': 23679},
 'overall_precision': 0.8575372722252899,
 'overall_recall': 0.8469492375820772,
 'overall_f1': 0.8522103691997014,
 'overall_accuracy': 0.9676213772160698}