# Тренировка модели машинного обучения

Запускал ноутбук удаленно на Kaggle Server

In [1]:
!nvidia-smi

Sun Sep 21 17:57:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             26W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Imports

In [32]:
import os
import random
import torch
import numpy as np
import pandas as pd
from typing import List
from datasets import Dataset
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    LogitsProcessor,
    LogitsProcessorList,
)
import getpass
os.environ["HF_TOKEN"] =  getpass.getpass("Enter your HF_TOKEN: ")

In [46]:
window_length = 64
stride = 48
seed = 2025
val_size = 0.2
max_examples = 100000

In [10]:
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed)

## Dataset Creation

In [11]:
def process_text_with_spaces(text):
    method_choice = random.random()
    
    if method_choice < 0.1: # 10% - Remove all spaces
        return text.replace(" ", "")
    
    elif method_choice < 0.2: # 10% - Add space between each character
        return " ".join(text)
    
    else:
        result_chars = []
        
        i = 0
        n = len(text)
    
        while i < n:
            current_char = text[i]
            if current_char == ' ':
                # It's a space. 20% chance to delete it (skip it).
                if random.random() < 0.2:
                    i += 1  # Just move to the next char, don't add this space.
                    continue
                else:
                    result_chars.append(current_char)  # Keep the space.
                    i += 1
            else:
                # It's a non-space character. Always add it.
                result_chars.append(current_char)
                # Check if there is a next character and it's also non-space
                if i + 1 < n and text[i+1] != ' ':
                    # 10% chance to insert an extra space after this character.
                    if random.random() < 0.1:
                        result_chars.append(' ')
                i += 1

    return ''.join(result_chars)

In [12]:
training_data_filepath = '/kaggle/input/avito-ds-dataset/training_data_russian_literature.txt'

In [13]:
with open(training_data_filepath, 'r') as file:
    text = file.read().strip()

In [14]:
src_windows = []
tgt_windows = []

for i in tqdm(range(0, len(text), stride)):
    src_window = text[i:i+window_length].strip()
    # Generate the corrupted version of that window
    tgt_window = process_text_with_spaces(src_window).strip()
    
    if len(src_window) > 0 and len(tgt_window) > 0:
        src_windows.append(tgt_window)
        tgt_windows.append(src_window)


dataset = Dataset.from_dict({
    "src_text": src_windows,
    "tgt_text": tgt_windows,
})

  0%|          | 0/768526 [00:00<?, ?it/s]

In [15]:
idx = 105
print(f"src='{dataset['src_text'][idx]}'")
print(f"tgt='{dataset['tgt_text'][idx]}'")

src='ви и подпа пертью; ну, э то кусается .В тр ет ьем ра зряд е за э тотр'
tgt='ви и под папертью; ну, это кусается. В третьем разряде за этот р'


In [16]:
dataset

Dataset({
    features: ['src_text', 'tgt_text'],
    num_rows: 768526
})

In [17]:
dataset_size = len(dataset)
sample_size = min(dataset_size, max_examples)
random_indices = random.sample(range(dataset_size), sample_size)
dataset_limited = dataset.select(random_indices)
dataset_limited

Dataset({
    features: ['src_text', 'tgt_text'],
    num_rows: 100000
})

In [18]:
dataset_split = dataset_limited.train_test_split(test_size=val_size, seed=seed)
train_dataset = dataset_split["train"]
val_dataset = dataset_split["test"]

## Model

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [20]:
model_name = "zarus03/byt5-wsc"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [21]:
model

T5ForConditionalGeneration(
  (shared): Embedding(384, 1472)
  (encoder): T5Stack(
    (embed_tokens): Embedding(384, 1472)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1472, out_features=384, bias=False)
              (k): Linear(in_features=1472, out_features=384, bias=False)
              (v): Linear(in_features=1472, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=1472, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1472, out_features=3584, bias=False)
              (wi_1): Linear(in_features=1472, out_features=3584, bias=False)
              (w

In [22]:
len(tokenizer.get_vocab())

384

In [23]:
tokenizer.get_vocab()

{'<pad>': 0,
 '</s>': 1,
 '<unk>': 2,
 '\x00': 3,
 '\x01': 4,
 '\x02': 5,
 '\x03': 6,
 '\x04': 7,
 '\x05': 8,
 '\x06': 9,
 '\x07': 10,
 '\x08': 11,
 '\t': 12,
 '\n': 13,
 '\x0b': 14,
 '\x0c': 15,
 '\r': 16,
 '\x0e': 17,
 '\x0f': 18,
 '\x10': 19,
 '\x11': 20,
 '\x12': 21,
 '\x13': 22,
 '\x14': 23,
 '\x15': 24,
 '\x16': 25,
 '\x17': 26,
 '\x18': 27,
 '\x19': 28,
 '\x1a': 29,
 '\x1b': 30,
 '\x1c': 31,
 '\x1d': 32,
 '\x1e': 33,
 '\x1f': 34,
 ' ': 35,
 '!': 36,
 '"': 37,
 '#': 38,
 '$': 39,
 '%': 40,
 '&': 41,
 "'": 42,
 '(': 43,
 ')': 44,
 '*': 45,
 '+': 46,
 ',': 47,
 '-': 48,
 '.': 49,
 '/': 50,
 '0': 51,
 '1': 52,
 '2': 53,
 '3': 54,
 '4': 55,
 '5': 56,
 '6': 57,
 '7': 58,
 '8': 59,
 '9': 60,
 ':': 61,
 ';': 62,
 '<': 63,
 '=': 64,
 '>': 65,
 '?': 66,
 '@': 67,
 'A': 68,
 'B': 69,
 'C': 70,
 'D': 71,
 'E': 72,
 'F': 73,
 'G': 74,
 'H': 75,
 'I': 76,
 'J': 77,
 'K': 78,
 'L': 79,
 'M': 80,
 'N': 81,
 'O': 82,
 'P': 83,
 'Q': 84,
 'R': 85,
 'S': 86,
 'T': 87,
 'U': 88,
 'V': 89,
 'W': 90,

In [24]:
custom_str = 'привет'
print(f"{len(custom_str)=}")
tokenized_seq = tokenizer.encode(custom_str)
print(f"{len(tokenized_seq)=}")
print(tokenized_seq)

len(custom_str)=6
len(tokenized_seq)=13
[211, 194, 212, 131, 211, 187, 211, 181, 211, 184, 212, 133, 1]


In [25]:
def preprocess(batch):
    model_inputs = tokenizer(
        batch["src_text"],
        max_length=window_length,
        padding="max_length",
        truncation=True,
    )

    model_inputs["labels"] = tokenizer(
        batch["tgt_text"],
        max_length=window_length,
        padding="max_length",
        truncation=True,
    )["input_ids"]
    
    return model_inputs

In [26]:
tokenized_train = train_dataset.map(
    preprocess, batched=True, remove_columns=["src_text", "tgt_text"])

tokenized_val = val_dataset.map(
    preprocess, batched=True, remove_columns=["src_text", "tgt_text"])

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

## Constrained decoding implementation

In [27]:
# ---------------------------
# 4) Constrained decoding: custom LogitsProcessor
# ---------------------------
# We'll implement a LogitsProcessor that, for each batch element at each generation step,
# allows only two token ids:
#   - space_id
#   - the id for the next byte of the encoder input sequence (ignoring encoder padding)
# Implementation details:
# - The processor is initialized with encoder_input_ids (unpadded list of bytes) for each batch item.
# - At each call, given input_ids (generated tokens so far), we compute pos = number of generated tokens that matched encoder bytes in order.
#   - Spaces do not advance pos. When pos reaches encoder length, generation stops (or only spaces allowed).
# - Then we set logits for disallowed tokens to -inf.

class EDConstrainedLogitsProcessor(LogitsProcessor):
    def __init__(self, encoder_inputs: List[List[int]], pad_token_id: int, space_token_id: int):
        """
        encoder_inputs: list of lists (for each batch element) containing encoder input ids 
        (without special padding or with padding - we'll trim)
        """
        self.encoder_inputs = [self._trim(inp, pad_token_id) for inp in encoder_inputs]
        self.pad = pad_token_id
        self.space_id = space_token_id

    def _trim(self, arr, pad_id):
        # remove trailing pad tokens if present
        trimmed = [int(x) for x in arr if int(x) != pad_id]
        return trimmed

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """
        input_ids: (batch_size, cur_len)
        scores: (batch_size, vocab_size) (logits for next token)
        Return masked scores.
        """
        bs = input_ids.size(0)
        vocab_size = scores.size(-1)
        device = scores.device
        big_neg = -1e9

        masked = scores.clone()

        for i in range(bs):
            enc = self.encoder_inputs[i]
            # compute pos: number of previous generated tokens that match enc in order
            # Skip BOS if present (T5 generation may start with decoder_start_token_id) — we only look at tokens after the decoder start.
            gen = input_ids[i].tolist()
            # compute position in encoder: count how many times we consumed a source token
            pos = 0
            enc_idx = 0
            # iterate generated tokens (skip initial special tokens like bos if any)
            # We assume that whenever a generated token equals enc[enc_idx], it consumes that input byte (pos++)
            for tok in gen:
                if enc_idx < len(enc) and tok == enc[enc_idx]:
                    enc_idx += 1
            pos = enc_idx

            allowed = set()
            allowed.add(self.space_id)
            if pos < len(enc):
                allowed.add(enc[pos])

            # if pos >= len(enc): optionally allow only spaces or EOS, but here we will allow only EOS and space. We'll allow EOS as well.
            if pos >= len(enc):
                # optionally allow eos (if defined), but we don't know eos here. We'll allow space only.
                allowed = {self.space_id}
            # mask others
            mask = torch.full((vocab_size,), big_neg, device=device)
            allowed_list = list(allowed)
            mask[allowed_list] = 0.0
            masked[i] = scores[i] + mask

        return masked

def build_logits_processor_for_batch(batch_input_ids, pad_token_id, space_token_id):
    # batch_input_ids: torch tensor (batch, seq_len)
    encoder_inputs = [row.tolist() for row in batch_input_ids]
    ed = EDConstrainedLogitsProcessor(
        encoder_inputs=encoder_inputs,
        pad_token_id=pad_token_id,
        space_token_id=space_token_id
    )
    return LogitsProcessorList([ed])

In [28]:
# ---------------------------
# 7) Custom generate wrapper for constrained decoding
# ---------------------------
def generate_constrained(batch, max_length):
    """
    batch: dict with 'input_ids' and 'attention_mask' (tensors, batched)
    Return: list of decoded strings
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    pad_id = tokenizer.pad_token_id
    space_id = tokenizer.encode(" ", add_special_tokens=False)[0]

    logits_processor = build_logits_processor_for_batch(
        input_ids.detach().cpu(),
        pad_token_id=pad_id,
        space_token_id=space_id
    )

    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        logits_processor=logits_processor,
        return_dict_in_generate=True,
        output_scores=False,
        do_sample=False,
    )

    return generated.sequences

In [29]:
# ---------------------------
# 8) Seq2SeqTrainer but override prediction_step to use constrained decoding at eval time
# ---------------------------
class ConstrainedSeq2SeqTrainer(Seq2SeqTrainer):
    def __init__(self, *args, **kwargs):
        self.max_length = kwargs.pop('max_length', None)
        super().__init__(*args, **kwargs)

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        We override to use constrained generation at prediction time.
        """
        # For training or when not predicting with generate, fallback
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

        # otherwise, compute loss and also produce constrained predictions
        # move inputs to device
        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)

        with torch.no_grad():
            loss = self.compute_loss(model, inputs, return_outputs=False)

        generated_tokens = generate_constrained(inputs, self.max_length)

        if generated_tokens.shape[-1] < self.max_length:
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, self.max_length)

        if has_labels:
            labels = inputs["labels"]
            if labels.shape[-1] < self.max_length:
                labels = self._pad_tensors_to_max_len(labels, self.max_length)
        else:
            labels = None

        return (loss, generated_tokens, labels)

## Metrics

In [30]:
def get_space_indices(text: str):
    """Return indices of spaces in the text"""
    return {i for i, ch in enumerate(text) if ch.isspace()}


def calculate_exact_match_accuracy(preds: list, labels: list) -> float:
    """Calculate exact string match accuracy"""
    correct = sum(p == l for p, l in zip(preds, labels))
    return correct / len(preds)


def calculate_f1_score(preds: list, labels: list) -> dict:
    """Calculate F1 score and related metrics for space prediction"""
    all_precisions, all_recalls, all_f1s = [], [], []

    for pred, label in zip(preds, labels):
        pred_spaces = get_space_indices(pred)
        ref_spaces = get_space_indices(label)

        tp = len(pred_spaces & ref_spaces)
        fp = len(pred_spaces - ref_spaces)
        fn = len(ref_spaces - pred_spaces)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)
    
    return np.mean(all_precisions), np.mean(all_recalls), np.mean(all_f1s)

    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    preds = preds[0]

    # Convert logits to predicted token IDs
    preds = np.argmax(preds, axis=-1)

    # preds is already numpy array (batch_size, seq_len)
    # decode predictions
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in labels with pad_token_id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Exact string match accuracy
    acc = calculate_exact_match_accuracy(preds, labels)
    p, r, f1 = calculate_f1_score(preds, labels)

    return {
        "accuracy": acc,
        "precision": p,
        "recall": r,
        "f1": f1,
    }

## Train model

In [43]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-ed",
    eval_strategy="steps",
    eval_steps=2000,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=2,
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=10,
    weight_decay=0.01,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

trainer = ConstrainedSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    max_length=window_length,
)
print("Starting training...")
trainer.train()

print("Training finished. Saving model...")
trainer.save_model(training_args.output_dir)

print(f"Model saved {training_args.output_dir}")

  super().__init__(*args, **kwargs)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Starting training...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
2000,0.0632,0.053305,0.5586,0.9379,0.950373,0.9422
4000,0.0441,0.05268,0.56575,0.93755,0.942311,0.93808
6000,0.053,0.052775,0.5761,0.943563,0.949325,0.94463
8000,0.0419,0.054038,0.5849,0.951381,0.958991,0.953393


KeyboardInterrupt: 

## Load checkpoint and predict

In [47]:
def correct_spaces(text: str) -> str:
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding=True, 
        max_length=window_length
    ).to(device)
    
    return [tokenizer.decode(seq, skip_special_tokens=True).strip()
            for seq in generate_constrained(inputs, window_length)]

In [8]:
model_name = "zarus03/byt5-wsc"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/797 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

NameError: name 'device' is not defined

In [48]:
correct_spaces('работавМосквеудаленно')[0]

'работа в Москве удаленно'

In [49]:
test = pd.read_csv('/kaggle/input/avito-ds-dataset/test.csv')
test.head()

Unnamed: 0,id,text_no_spaces
0,0,куплюайфон14про
1,1,ищудомвПодмосковье
2,2,сдаюквартирусмебельюитехникой
3,3,новыйдивандоставканедорого
4,4,отдамдаромкошку


In [50]:
test['text_with_spaces'] = [correct_spaces(text)[0] 
                            for text in tqdm(test['text_no_spaces'], desc='correcting')]

correcting:   0%|          | 0/1005 [00:00<?, ?it/s]

In [51]:
test.head(20)

Unnamed: 0,id,text_no_spaces,text_with_spaces
0,0,куплюайфон14про,куплю айфон 14 про
1,1,ищудомвПодмосковье,ищу дом в Подмосковье
2,2,сдаюквартирусмебельюитехникой,сдаю квартиру с мебелью и техникой
3,3,новыйдивандоставканедорого,новый диван доставка недорого
4,4,отдамдаромкошку,отдам даром кошку
5,5,работавМосквеудаленно,работа в Москве удаленно
6,6,куплютелевизорPhilips,куплю телев изор Philips
7,7,ищугрузчиковдляпереезда,ищу грузчиков для переезда
8,8,ремонтквартирподключ,ремонт квартир подключ
9,9,куплюноутбукHP,куплюно утбук HP


In [52]:
test.to_csv('/kaggle/working/pred.csv')