In [1]:
from datasets import load_dataset

dataset = load_dataset("wikiann", "ru")

  from .autonotebook import tqdm as notebook_tqdm


Отображаем классы выбранного датасета

In [2]:
label_list = dataset['train'].features['ner_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

Следующим шагом является загрузка токенайзера выбранной модели для обработки "поля токенов" (tokens field).

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [4]:
example = dataset['train'][0]
example

{'tokens': ['Илизаров', ',', 'Гавриил', 'Абрамович'],
 'ner_tags': [1, 2, 2, 2],
 'langs': ['ru', 'ru', 'ru', 'ru'],
 'spans': ['PER: Илизаров , Гавриил Абрамович']}

In [5]:
tokenized_input = tokenizer(example['tokens'], is_split_into_words=True)
tokenized_input

{'input_ids': [101, 35377, 31332, 1388, 128, 56031, 41439, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]', 'Или', '##зар', '##ов', ',', 'Гавриил', 'Абрамович', '[SEP]']

In [7]:
###----------
### TEST CELL
###----------

tokenized_input = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
labels = []
for i, label in enumerate(example[f"ner_tags"]):
    word_ids = tokenized_input.word_ids(batch_index=i)  # Map tokens to their respective word.
    print(word_ids)
    break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[None, 0, 0, 0, 1, 2, 3, None]


In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_ru_wikiann = dataset.map(tokenize_and_align_labels, batched=True)

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [11]:
import evaluate

seqeval = evaluate.load("seqeval")

In [12]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [13]:
id2label = dict(zip([i for i in range(len(label_list))], label_list))
label2id = {value: key for key, value in id2label.items()}

In [14]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased", num_labels=7, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
training_args = TrainingArguments(
    output_dir="my_awesome_tokenized_ru_wikiann_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ru_wikiann["train"],
    eval_dataset=tokenized_ru_wikiann["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1786,0.145926,0.880746,0.900057,0.890297,0.960919
2,0.105,0.144565,0.895697,0.907613,0.901615,0.963851


Checkpoint destination directory my_awesome_tokenized_ru_wikiann_model\checkpoint-1250 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_tokenized_ru_wikiann_model\checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2500, training_loss=0.16737534332275392, metrics={'train_runtime': 442.6346, 'train_samples_per_second': 90.368, 'train_steps_per_second': 5.648, 'total_flos': 525532360870176.0, 'train_loss': 0.16737534332275392, 'epoch': 2.0})

In [19]:
# Сохраняем модель
save_directory = './pt_save_pretrained'
#tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)