In [2]:
import os
import random
import pandas as pd
import torch
from torch.utils.data import Dataset as TorchDataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, IntervalStrategy
import nltk
from tqdm import tqdm



In [3]:
nltk.download('punkt', quiet=True)

True

In [None]:
XLA_AVAILABLE = False
try:
    import torch_xla.core.xla_model as xm
    XLA_AVAILABLE = True
    print("TPU/XLA обнаружен. Обучение будет использовать XLA-бэкэнд.")
except ImportError:
    if torch.cuda.is_available():
        DEVICE = torch.device("cuda")
        print(f"CUDA обнаружена. Используемое устройство: {DEVICE}")
    else:
        DEVICE = torch.device("cpu")
        print(f"CUDA не найдена. Используемое устройство: {DEVICE}")

TPU/XLA обнаружен. Обучение будет использовать XLA-бэкэнд.


In [5]:
class SpellingCorrectionDataset(TorchDataset):
    """Класс Pytorch Dataset для работы с Pandas DataFrame."""
    def __init__(self, dataframe, tokenizer, max_length=64):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length
        self.input_texts = self.data['input_text'].tolist()
        self.target_texts = self.data['target_text'].tolist()

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, index):
        input_text = self.input_texts[index]
        target_text = self.target_texts[index]

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer(
            text_target=target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': target_encoding['input_ids'].flatten()
        }

In [6]:
def calculate_cer(reference, hypothesis):
    reference = reference.replace(' ', '')
    hypothesis = hypothesis.replace(' ', '')
    if len(reference) == 0:
        return 0.0
    return nltk.edit_distance(reference, hypothesis) / len(reference)

def correct_word(input_word_only, current_model, current_tokenizer):
    prefixed_text = 'fix spelling: ' + input_word_only
    inputs = current_tokenizer(prefixed_text, return_tensors="pt", max_length=64, truncation=True, padding="max_length")

    model_device = current_model.device
    input_ids = inputs.input_ids.to(model_device)
    attention_mask = inputs.attention_mask.to(model_device)
    if XLA_AVAILABLE and model_device.type == 'xla':
        with torch.no_grad():
            outputs = current_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=64,
                num_beams=4,
                early_stopping=True,
            ).cpu()
    else:
        with torch.no_grad():
            outputs = current_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=64,
                num_beams=4,
                early_stopping=True
            )

    return current_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

In [7]:
try:
    FULL_TRAIN_DF = pd.read_csv("./train_words.csv")
    FULL_VAL_DF = pd.read_csv("./val_words.csv")
    FULL_TEST_DF = pd.read_csv("./test_words.csv")

    print(f"Полные данные загружены: Train={len(FULL_TRAIN_DF)}, Val={len(FULL_VAL_DF)}, Test={len(FULL_TEST_DF)}")
except FileNotFoundError:
    print("Ошибка: CSV файлы датасета не найдены.")
    exit()


Полные данные загружены: Train=38856, Val=4857, Test=4857


In [8]:
TRAIN_DF = FULL_TRAIN_DF.sample(n=len(FULL_TRAIN_DF), random_state=42)
VAL_DF = FULL_VAL_DF.sample(n=len(FULL_VAL_DF), random_state=42)
TEST_DF = FULL_TEST_DF.sample(n=len(FULL_TEST_DF), random_state=42)

In [9]:
model_name = "cointegrated/rut5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

if not XLA_AVAILABLE:
    model.to(DEVICE)

train_dataset = SpellingCorrectionDataset(TRAIN_DF, tokenizer)
val_dataset = SpellingCorrectionDataset(VAL_DF, tokenizer)
EST_DF = FULL_TEST_DF.sample(n=len(FULL_TEST_DF)//10, random_state=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/640k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/259M [00:00<?, ?B/s]

In [None]:
NUM_EPOCHS_TEST = 3

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,

    optim="adamw_torch",

    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=NUM_EPOCHS_TEST,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [11]:
print(f"Обучение в {NUM_EPOCHS_TEST} эпох")
trainer.train()

output_dir = "./"

model.save_pretrained(output_dir, safe_serialization=False)
tokenizer.save_pretrained(output_dir)

Обучение в 3 эпох




Epoch,Training Loss,Validation Loss
1,0.1123,0.089227
2,0.0928,0.073698
3,0.0775,0.069653




('./tokenizer_config.json',
 './special_tokens_map.json',
 './spiece.model',
 './added_tokens.json')

In [12]:
total_cer = 0
TEST_DF = FULL_TEST_DF.sample(n=len(FULL_TEST_DF)//3, random_state=42)
N = len(TEST_DF)

test_data_for_eval = TEST_DF.to_dict('records')

for row in tqdm(test_data_for_eval, desc="Тестирование"):
    target = row['target_text']
    input_word_only = row['input_text'].replace('fix spelling: ', '')
    predicted = correct_word(input_word_only, model, tokenizer)
    current_cer = calculate_cer(target, predicted)
    total_cer += current_cer

final_cer = total_cer / N
print(f"CER НА ТЕСТОВОМ ДАТАСЕТЕ: {final_cer:.4f}")

Тестирование: 100%|██████████| 1619/1619 [11:29<00:00,  2.35it/s]

CER НА ТЕСТОВОМ ДАТАСЕТЕ: 0.1810



