In [1]:
!pip install dataset
!pip install evaluate
!pip install seqeval
!pip install transformers --upgrade

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.52-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Collecting Mako (from alembic>=0.6.2->dataset)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: banal, sqlalchemy, Mako, alembic, dataset

In [2]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [32]:
import numpy as np

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import evaluate

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
def read_and_process_data(file_path, labels):
    # Чтение файла и разделение его на строки
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    id2label = {idx: label for idx, label in enumerate(labels)}
    label2id = {label: idx for idx, label in id2label.items()}

    # Создание списка для хранения текста и меток
    texts = []
    labels = []

    # Проход по строкам файла и разделение на текст и метки
    text = []
    label = []
    for line in lines:
        line = line.strip().split()
        if line:
            word, word_label = line
            text.append(word)
            label.append(label2id[word_label])
        elif text and label:
            texts.append(text)
            labels.append(label)
            text = []
            label = []
    if text and label:
            texts.append(text)
            labels.append(label)
            # Пустая строка разделяет примеры
    # Создание набора данных
    dataset = Dataset.from_dict({"tokens": texts, "ner_tags": labels})
    return dataset, label2id, id2label

In [99]:
label_list = ["O", "B-VIOLATION", "I-VIOLATION", "B-MISSING-BEFORE", "B-MISSING-AFTER"]
dataset, label2id, id2label = read_and_process_data("/content/drive/MyDrive/Хакатон УрФ 2024/Data.txt", label_list)

In [100]:
dataset = dataset.train_test_split(test_size=1/6)

In [101]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 4
    })
})

In [128]:
model_path = "KoichiYasuoka/bert-base-russian-upos"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=len(label_list), ignore_mismatched_sizes=True, id2label=id2label, label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at KoichiYasuoka/bert-base-russian-upos and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([89, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([89]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [129]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [130]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [131]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [132]:
seqeval = evaluate.load("seqeval")

In [133]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [153]:
training_args = TrainingArguments(
    output_dir="TokenClassifierResults",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=200,
    weight_decay=0.01,
    save_strategy="steps",
    evaluation_strategy = 'steps',
    logging_strategy = 'steps',
    eval_steps = 10, # Evaluation and Save happens every 10 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

model.save_pretrained("/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel")
tokenizer.save_pretrained("/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel")



Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,No log,1.161892,0.0,0.0,0.0,0.86121
20,No log,1.02763,0.0,0.0,0.0,0.857651
30,No log,1.113064,0.0,0.0,0.0,0.86121
40,No log,1.156984,0.0,0.0,0.0,0.86121
50,No log,1.145696,0.0,0.0,0.0,0.864769
60,No log,1.172357,0.0,0.0,0.0,0.864769
70,No log,1.183761,0.0,0.0,0.0,0.868327
80,No log,1.193548,0.0,0.0,0.0,0.868327
90,No log,1.138003,0.0,0.0,0.0,0.864769
100,No log,1.199569,0.0,0.0,0.0,0.864769


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel/tokenizer_config.json',
 '/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel/special_tokens_map.json',
 '/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel/vocab.txt',
 '/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel/added_tokens.json',
 '/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel/tokenizer.json')

In [108]:
inputs = tokenizer(
            dataset["train"][2]["tokens"],
            truncation=True,
            is_split_into_words=True,
            return_tensors='pt',
        )
inputs.to("cuda")

{'input_ids': tensor([[   101,  18371,    145,  75275,  27070,   4007,    130,  54333, 109258,
           2325,  18371,    145,  27070,   4007,    130,  54333, 109258,   2325,
           9257,  62475,  77489,    845,   7769, 109258,   2325, 105282,   8953,
            128,    625,    130,    625,    128,  12588,  86501,    128,    625,
            130,    625,   7382,   1916,  12588,   7309,   3049,   5806,   3955,
           9688,   1580,  54333, 109258,   1636,   5022,  29526,   9798,  16636,
            128,  85616,    896,   1699,  17530,    842,  38322,  15529,  10766,
         111267,    822,  45113,    132,  38322,  15529,  45113,    132,  71576,
            102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]], device='cuda:0'), 'attenti

In [172]:
from transformers import pipeline
classifier = pipeline("ner", model="/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel", tokenizer="/content/drive/MyDrive/Хакатон УрФ 2024/TokenClassifierModel")

In [183]:
classifier("Здравствуйте, на приближении станции Уралтау. 4 маршрут, проходной Уралтау, слушаю.Здравствуйте, по станции Уралтау, входной вам открыт, на 4-й путь, выходной, судоход запрещающий, на 1-й путь нечетный прибудет, проходы будут, буду пропускать по 4-му ДНЦ Мухамедьярова.Здравствуйте, понятно, Уралтау, маршрут на 4-й, боковой, свободный путь, выходной, судоход запрещающий, по прибытию нечетного будете дальше спрятать, ДНЦ Мухамедьярова.Верно.")

[{'entity': 'B-VIOLATION',
  'score': 0.99206334,
  'index': 20,
  'word': 'Здравствуй',
  'start': 83,
  'end': 93},
 {'entity': 'B-VIOLATION',
  'score': 0.60090625,
  'index': 68,
  'word': 'Здравствуй',
  'start': 269,
  'end': 279},
 {'entity': 'B-VIOLATION',
  'score': 0.6967991,
  'index': 97,
  'word': 'дальше',
  'start': 402,
  'end': 408}]

In [171]:
import torch

n = 1
inputs = tokenizer(dataset["test"][n]["tokens"], truncation=True, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs.to("cuda")).logits

predictions = torch.argmax(logits, dim=2)
predicted_token_class = list(zip((i.item() for i in predictions[0]), dataset["test"][n]["ner_tags"]))
predicted_token_class

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 2),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (2, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 2),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 2),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 2),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 2),
 (0, 2)]

In [144]:
dataset["train"][1]["ner_tags"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
model.config.id2label

{'O': 0,
 'B-VIOLATION': 1,
 'I-VIOLATION': 2,
 'B-MISSING-BEFORE': 3,
 'B-MISSING-AFTER': 4}