In [1]:
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git

fatal: destination path 'character-tokenizer' already exists and is not an empty directory.


Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers. Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

1. Напишите класс для Dataset/Dataloder и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
2. Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла).
Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

In [2]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
import pandas as pd
from pynvml import *
import numpy as np

In [4]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [5]:
df = pd.read_csv("all_accents.tsv", delimiter="\t")
df

Unnamed: 0,-де,-д^е
0,-ка,-к^а
1,-либо,-л^ибо
2,-нибудь,-ниб^удь
3,-с,-с
4,-таки,-так^и
...,...,...
1680529,ӂюль-верновский,ӂюль-в^ерновский
1680530,ӂюрить,ӂюр^ить
1680531,ӂӂение,ӂӂ^ение
1680532,ӂӂенный,ӂӂенный


In [6]:
count_without_caret = np.sum(['^' in s for s in df["-де"]])

print("Number of strings with '^':", count_without_caret)

Number of strings with '^': 0


In [7]:
count_without_caret = np.sum(['^' not in s for s in df["-д^е"]])

print("Number of strings without '^':", count_without_caret)

Number of strings without '^': 507


In [8]:
df_with_caret = df[df["-д^е"].str.contains(r'\^', na=False)]
df_with_caret

Unnamed: 0,-де,-д^е
0,-ка,-к^а
1,-либо,-л^ибо
2,-нибудь,-ниб^удь
4,-таки,-так^и
5,-то,-т^о
...,...,...
1680527,ѐльцинско-гайдаровский,ѐльцинско-гайд^аровский
1680528,ӂен-премьер,ӂен-премь^ер
1680529,ӂюль-верновский,ӂюль-в^ерновский
1680530,ӂюрить,ӂюр^ить


In [9]:
unique_chars = sorted(set("".join(df_with_caret["-д^е"])))

print("Unique characters:", unique_chars)

Unique characters: [' ', ',', '-', '.', '/', ';', '^', '_', 'c', 'g', 'h', '{', '~', '\xa0', '\xad', '·', 'ʔ', 'ʕ', '̣', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ѐ', 'і', 'ѣ', 'ӂ', '\u200e', '\u200f', '—']


### Лучше ограничиться современным русским алфавитом

In [10]:
allowed_chars = set(unique_chars[19:51] + ['-', '^', '—'])
allowed_chars

{'-',
 '^',
 'а',
 'б',
 'в',
 'г',
 'д',
 'е',
 'ж',
 'з',
 'и',
 'й',
 'к',
 'л',
 'м',
 'н',
 'о',
 'п',
 'р',
 'с',
 'т',
 'у',
 'ф',
 'х',
 'ц',
 'ч',
 'ш',
 'щ',
 'ъ',
 'ы',
 'ь',
 'э',
 'ю',
 'я',
 '—'}

In [11]:
len(allowed_chars)

35

In [12]:
df_filtered = df_with_caret[df_with_caret["-д^е"].apply(lambda s: all(char in allowed_chars for char in s))]

df_filtered

Unnamed: 0,-де,-д^е
0,-ка,-к^а
1,-либо,-л^ибо
2,-нибудь,-ниб^удь
4,-таки,-так^и
5,-то,-т^о
...,...,...
1680436,ящурок,^ящурок
1680437,ящуром,^ящуром
1680438,ящуру,^ящуру
1680439,яэль,я^эль


### Потери небольшие, зато везде есть ударение и нет ненужных символов

In [13]:
df.shape[0] - df_filtered.shape[0]

5880

In [14]:
chars = ''.join(sorted("".join(allowed_chars)))
chars

'-^абвгдежзийклмнопрстуфхцчшщъыьэюя—'

In [15]:
import string
import sys
sys.path.append("/content/character-tokenizer")
from charactertokenizer import CharacterTokenizer

model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)

In [16]:
example = "привет"
tokens = tokenizer(example)
print(tokens)

{'input_ids': [0, 24, 25, 17, 11, 14, 27, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [17]:
words = df_filtered["-де"].values
words_with_stress = df_filtered["-д^е"].values

In [18]:
print(' Original: ', words[0])

print('Tokenized: ', tokenizer.tokenize(words[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(words[0])))

 Original:  -ка
Tokenized:  ['-', 'к', 'а']
Token IDs:  [7, 19, 9]


In [19]:
max_len = 0

for word in words:

    input_ids = tokenizer.encode(word, add_special_tokens=True)

    max_len = max(max_len, len(input_ids))

print('Max word length: ', max_len)

Max word length:  58


In [20]:
MAX_LENGTH = 60
input_ids = []
attention_masks = []
labels = []
for inp, out in zip(words, words_with_stress):
    tokenized = tokenizer.encode_plus(
        inp,
        truncation = True,            # Truncate all words.
        add_special_tokens = True,
        max_length = MAX_LENGTH,      # Pad/Truncate all words.
        padding = 'max_length',       # Pad all words.
        return_attention_mask = True, # Construct attn. masks.
        return_tensors = 'pt',        # Return pytorch tensors.
    )
    
    token_labels = torch.tensor([out.index("^") + 1], dtype=int)
    labels.append(token_labels)
    
    # Add the encoded word to the list.
    input_ids.append(tokenized['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(tokenized['attention_mask'])
    assert token_labels[0]<60, "target: {} invalid".format(target)

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.cat(labels, dim=0)

print('Original: ', words[0])
print('Original with stress: ', words_with_stress[0])
print('Token IDs:', input_ids[0])
print('Mask: ', attention_masks[0])
print('Label:', labels[0])

Original:  -ка
Original with stress:  -к^а
Token IDs: tensor([ 0,  7, 19,  9,  1,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4])
Mask:  tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Label: tensor(3)


# Напишите класс для Dataset/Dataloder и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)

In [21]:
from torch.utils.data import TensorDataset, random_split

In [22]:
class NamedTensorDataset(TensorDataset):
    def __init__(self, *tensors, names):
        """
        Initialize the NamedTensorDataset.

        Args:
            *tensors: Tensors to be stored in the dataset.
            names (tuple): A tuple of strings representing the names of the tensors.
        """
        if len(tensors) != len(names):
            raise ValueError("Number of tensors and number of names must be the same.")
        for tensor in tensors:
            if not isinstance(tensor, torch.Tensor):
                raise TypeError("All inputs must be torch.Tensors.")
            if tensors[0].size(0) != tensor.size(0):
                raise ValueError("All tensors must have the same size in the first dimension.")
        self.tensors = tensors
        self.names = names

    def __getitem__(self, index):
        """
        Retrieve a single item from the dataset.

        Args:
            index (int): The index of the item to retrieve.

        Returns:
            dict: A dictionary mapping names to the corresponding tensor values for the given index.
        """
        return {name: tensor[index] for name, tensor in zip(self.names, self.tensors)}

    def __len__(self):
        """
        Returns the length of the dataset.

        Returns:
            int: The number of items in the dataset.
        """
        return self.tensors[0].size(0)

In [23]:


dataset = NamedTensorDataset(input_ids, attention_masks, labels, names=('input_ids', 'attention_masks', 'labels'))

train_size = int(0.5 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(test_size))

837,327 training samples
837,327 validation samples


In [24]:
batch_size = 1024

# Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла). 

In [25]:
from transformers import DebertaV2Config, DebertaV2ForSequenceClassification
import evaluate

config = DebertaV2Config.from_json_file("config.json")
model = DebertaV2ForSequenceClassification(config)

In [26]:
model.cuda()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(41, 256, padding_idx=0)
      (position_embeddings): Embedding(60, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-3): 4 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=256, out_features=256, bias=True)
              (key_proj): Linear(in_features=256, out_features=256, bias=True)
              (value_proj): Linear(in_features=256, out_features=256, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): Layer

In [27]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [28]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",          # Directory to save model checkpoints
    overwrite_output_dir=True,           # Overwrite content in the output directory
    eval_strategy="epoch",                # Evaluate after each epoch
    save_strategy="epoch",
    learning_rate=1e-4,                    # Learning rate for training
    per_device_train_batch_size=batch_size, # Batch size per device
    per_device_eval_batch_size=batch_size,  # Batch size per device
    num_train_epochs=77,                   # Total number of epochs
    weight_decay=1e-4,                    # Weight decay for regularization
    save_steps=10000,                    # Save checkpoint every 10000 steps
    save_total_limit=10,                 # Keep only the last 10 checkpoints
    logging_dir="./logs",                # Directory for logs
    logging_steps=10000,                 # Log every 10000 steps
    load_best_model_at_end=True,
    torch_compile=True,
)

The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here.


In [29]:
from transformers import Trainer

trainer = Trainer(
    model=model,                          # Model to train
    args=training_args,                   # Training configuration
    train_dataset=train_dataset,          # Training dataset
    eval_dataset=test_dataset,            # Evaluation dataset
    tokenizer=tokenizer,                  # Tokenizer for alignment (optional)
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.829358,0.69177
2,No log,0.674275,0.740037
3,No log,0.616466,0.757649
4,No log,0.571214,0.773046
5,No log,0.550709,0.779365
6,No log,0.519151,0.792757
7,No log,0.490125,0.801892
8,No log,0.477384,0.807121
9,No log,0.466689,0.812059
10,No log,0.460778,0.81442


TrainOutput(global_step=62986, training_loss=0.32351198603099796, metrics={'train_runtime': 22599.5327, 'train_samples_per_second': 2852.899, 'train_steps_per_second': 2.787, 'total_flos': 7.522041536379216e+16, 'train_loss': 0.32351198603099796, 'epoch': 77.0})

In [31]:
# Evaluate model
results = trainer.evaluate()

# Save the trained model and tokenizer
model.save_pretrained("./deberta_stress_model")
tokenizer.save_pretrained("./deberta_stress_model")

print("Training complete. Model saved!")

Training complete. Model saved!


In [32]:
results

{'eval_loss': 0.20526458323001862,
 'eval_accuracy': 0.9287196041689806,
 'eval_runtime': 97.7454,
 'eval_samples_per_second': 8566.408,
 'eval_steps_per_second': 8.369,
 'epoch': 77.0}

### Порог accuracy в 0.89 пройден. Далее наглядная демонстрация работы

In [33]:
from transformers import pipeline

In [34]:
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [45]:
def beauty_pred(pipe, text):
    index = pipe(text)
    pos = int(index[0]["label"][4:]) - 1
    return text[:pos] + '^' + text[pos:]

In [47]:
def beauty_cmp(dataset, pipe, id):
    item = dataset.__getitem__(id)
    pos = item['labels'] - 1
    text = tokenizer.decode(token_ids=item['input_ids'], skip_special_tokens=True)
    print("Как правильно:", text[:pos] + '^' + text[pos:])
    print(" Предсказание:", beauty_pred(pipe, text))

In [48]:
beauty_cmp(test_dataset, pipe, 42)

Как правильно: завздых^ало
 Предсказание: завздыхал^о


In [49]:
beauty_cmp(test_dataset, pipe, 420)

Как правильно: проводник^овую
 Предсказание: проводников^ую


In [50]:
beauty_cmp(test_dataset, pipe, 4200)

Как правильно: пл^енум
 Предсказание: пл^енум


In [51]:
beauty_cmp(test_dataset, pipe, 42000)

Как правильно: б^уквочку
 Предсказание: буквочк^у


In [52]:
beauty_cmp(test_dataset, pipe, 420000)

Как правильно: нар^оем
 Предсказание: нар^оем
