In [8]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import numpy as np
import transformers
from transformers import Trainer
from datasets import load_metric
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

## Helper functions

In [9]:
def align_labels_with_tokens(labels: list, word_ids: list) -> list:
    """
    Expand word-level labels to token level.

    Each token receives the label of the word it belongs to, so a word that
    was split into several tokens has its label repeated. Positions whose
    word id is ``None`` receive ``-100`` (the value ``compute_metrics``
    filters out).

    :param labels: word-level label ids for one sequence
    :type labels: list
    :param word_ids: word index for every token (repeated when a word was
        split into several tokens, ``None`` when a token has no source word)
    :type word_ids: list
    :return: token-level labels, one per entry of ``word_ids``
    :rtype: list
    """
    aligned = []
    for word_id in word_ids:
        if word_id is None:
            # No source word for this position -> use the ignore marker.
            aligned.append(-100)
        else:
            aligned.append(labels[word_id])
    return aligned

def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sequences and attach token-level labels.

    Uses the module-level ``tokenizer``. Each sequence is passed as a list of
    words (``is_split_into_words=True``) and is truncated by the tokenizer.

    :param examples: batch with a "sequences" entry (word lists) and an
        "ids" entry (per-word label ids), as supplied by ``map(batched=True)``
    :type examples: dict-like batch
    :return: tokenizer output with an extra "labels" entry holding one
        aligned label list per sequence
    :rtype: tokenizer output (dict-like)
    """
    encoded = tokenizer(
        examples["sequences"], truncation=True, is_split_into_words=True
    )
    # word_ids(row) gives the word index of every token in sequence `row`.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ids"])
    ]
    return encoded

def compute_metrics(eval_preds):
    """
    Score model output with the module-level ``metric``.

    Class ids are translated to names through the module-level
    ``label_names``; positions labelled ``-100`` are excluded from both the
    references and the predictions.

    :param eval_preds: pair ``(logits, labels)``
    :type eval_preds: tuple-like
    :return: dict with "precision", "recall", "f1" and "accuracy", taken
        from the metric's ``overall_*`` entries
    :rtype: dict
    """
    logits, labels = eval_preds
    # Predicted class id per position = index of the largest logit.
    predictions = np.argmax(logits, axis=-1)

    def _named(ids, reference):
        # Translate ids to label names, skipping positions marked -100 in `reference`.
        return [label_names[i] for i, ref in zip(ids, reference) if ref != -100]

    true_labels = [_named(row, row) for row in labels]
    true_predictions = [
        _named(pred_row, label_row)
        for pred_row, label_row in zip(predictions, labels)
    ]

    overall = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=overall["overall_precision"],
        recall=overall["overall_recall"],
        f1=overall["overall_f1"],
        accuracy=overall["overall_accuracy"],
    )

## Load Data

In [10]:
# Fetch the dataset by its hub identifier; per the log below, the prepared
# copy is cached locally and reused by subsequent calls.
raw_datasets = load_dataset("surdan/nerel_short")

Downloading:   0%|          | 0.00/924 [00:00<?, ?B/s]

Using custom data configuration surdan--for_test_v3-ebe3ed24b6dbf0bb


Downloading and preparing dataset None/None (download: 644.03 KiB, generated: 2.67 MiB, post-processed: Unknown size, total: 3.30 MiB) to /home/maksim/.cache/huggingface/datasets/surdan___parquet/surdan--for_test_v3-ebe3ed24b6dbf0bb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/115k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/435k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /home/maksim/.cache/huggingface/datasets/surdan___parquet/surdan--for_test_v3-ebe3ed24b6dbf0bb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Inspect the available splits, their columns and row counts.
raw_datasets

DatasetDict({
    dev: Dataset({
        features: ['sequences', 'ids'],
        num_rows: 536
    })
    test: Dataset({
        features: ['sequences', 'ids'],
        num_rows: 512
    })
    train: Dataset({
        features: ['sequences', 'ids'],
        num_rows: 2508
    })
})

## Preprocess data

In [12]:
# Pretrained checkpoint shared by the tokenizer and the token-classification model.
model_checkpoint = "cointegrated/LaBSE-en-ru"

In [18]:
# Tokenizer for the chosen checkpoint; used by tokenize_and_align_labels,
# the data collator and the Trainer.
# NOTE(review): do_lower_case=True presumably lowercases the text before
# tokenization -- confirm this matches how the checkpoint was pretrained,
# since letter case can be informative for entity recognition.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)

In [19]:
# Tokenize every split batch-wise and attach aligned labels.
# The original columns are dropped so only the tokenizer output and "labels"
# remain; the column names are taken from the train split, which has the
# same columns as dev and test.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [20]:
# Check that each split now holds only model inputs and "labels".
tokenized_datasets

DatasetDict({
    dev: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 536
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 512
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2508
    })
})

## Init Training pipeline

In [None]:
# Collator that assembles tokenized examples into batches for the Trainer;
# it is built from the same tokenizer used for preprocessing.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# NOTE(review): `map_id_to_label` is not defined in any visible cell, so this
# cell raises NameError on a fresh kernel (Restart & Run All). It presumably
# came from a removed cell (the otherwise unused `pickle` import hints at a
# pickled mapping) -- restore that cell above this one.

# Class id (stringified) -> label name.
id2label = {str(k): v for k, v in map_id_to_label.items()}
# Label name -> integer class id. The ids are converted back to int so the
# reverse map does not carry string ids into the model config.
label2id = {v: int(k) for k, v in id2label.items()}
# compute_metrics looks labels up as label_names[class_id], so order the
# names by numeric id instead of relying on dict insertion order.
# Assumes ids are contiguous and start at 0 -- TODO confirm.
label_names = [id2label[k] for k in sorted(id2label, key=int)]

In [None]:
# Load the pretrained checkpoint as a token-classification model and hand it
# both label maps so the model config carries the label names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Sanity check: expected to equal len(label_names).
model.config.num_labels

In [None]:
# Training configuration: evaluate once per epoch, write no intermediate
# checkpoints, and keep everything local (no hub upload).
args = TrainingArguments(
    output_dir="LaBSE_ner_nerel",
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,  # tune to the memory available on your GPU
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="no",
    push_to_hub=False,
)

## Train model

In [None]:
# Metric object read by compute_metrics (module-level `metric`).
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm the
# installed version still provides it.
metric = load_metric("seqeval")

In [None]:
# Wire model, arguments, data and metric function together.
# Training uses the train split; the per-epoch evaluation uses dev.
# The test split is not touched in this notebook.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Run fine-tuning; as the last expression, its return value is displayed.
trainer.train()

In [None]:
# save_strategy="no" means no checkpoints were written during training,
# so save the final model explicitly (path is relative to the working directory).
trainer.save_model("LaBSE_nerel_last_checkpoint")