In [1]:
from scripts.custom_dataset import CustomDataset
from scripts.vectorizer import Seq2Seq_Vectorizer
from scripts.tokenizer import SeparatorTokenizer
from scripts.vocabulary import Vocabulary
from scripts.model import Seq2Seq_Model

import numpy as np
import gc
import os
import pandas
import time
import pandas as pd
import regex as re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
TEST_PROPORTION = 0.0
EVAL_PROPORTION = 0.2

TOKENS_TRESHOLD_FREQ = 10

SHUFFLE = True
DROP_LAST = True
EPOCHS = 5
LEARNING_RATE = 0.0001
SAMPLE_PROBABILITY = 0.1 # Вероятность при обучении взять в качестве input сгенерированный токен, а не groung-truth токен

LR_SCHEDULER_FACTOR = 0.5
LR_SCHEDULER_PATIENCE = 2

BATCH_SIZE = 512
SOURCE_EMBEDDING_SIZE = 48
TARGET_EMBEDDING_SIZE = 48
MAX_SOURCE_SEQ_LEN = 100
MAX_TARGET_SEQ_LEN = 114
MAX_GENERATED_SEQ_LEN = 114

RNN_HIDDEN_SIZE = 64
FC_HIDDEN_SIZE = 256

MODEL_SAVE_FILEPATH = 'data/model_params.pt'
DATASET_PATH = 'D:/Files/Datasets/NMT_ru_en'

RANDOM_STATE = 42

In [3]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
print(DEVICE)

cuda


In [5]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    """
    Создаёт DataLoader и возвращает батчи в виде словаря.
    Для каждой итерации данные сортируются по длине последовательности
     и переносятся на нужное устройство.
    """
    dataloader = DataLoader(dataset, batch_size, shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        lengths = data_dict['source_len'].numpy()
        sorted_length_indices = lengths.argsort()[::-1].tolist()
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name][sorted_length_indices].to(device)
        yield out_data_dict

In [6]:
def normalize_sizes(y_pred, y_true):
    """
    Приводит выход модели и целевые тензоры к совместимым размерам.
    Если вывод 3‑D (B, T, vocab), меняем форму на матрицу (BT, vocab).
    Цели в виде матрицы (B, T) превращаем в вектор (BT).
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.reshape(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.reshape(-1)
    return y_pred, y_true

In [7]:
def compute_accuracy(y_pred, y_true, mask_index):
    """
    Вычисляет точность предсказаний (в %).
    Игнорируются позиции с `mask_index` (т.е. padding‑токены).
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

In [8]:
def sequence_loss(y_pred, y_true, mask_index):
    """
    Кросс‑энтропия для последовательностей.
    `ignore_index` позволяет игнорировать padding‑токены в loss.
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

In [9]:
def get_tokens_freq(dataframe : pandas.DataFrame) -> tuple[dict, dict]:
    """
    Счётчик частоты токенов по колонкам `source_text` и `target_text`.
    Возвращает два словаря: {token: freq}.
    """
    source_freq = {}
    target_freq = {}
    for i in range(len(dataframe)):
        source_tokens, target_tokens = (dataframe.loc[i, 'source_text'], dataframe.loc[i, 'target_text'])
        for token in source_tokens:
            if token in source_freq:
                source_freq[token] += 1
            else:
                source_freq[token] = 1
        for token in target_tokens:
            if token in target_freq:
                target_freq[token] += 1
            else:
                target_freq[token] = 1
    return source_freq, target_freq

In [10]:
def get_max_tokenized_seq_len(dataframe : pandas.DataFrame) -> tuple[int, int]:
    """
    Находит максимальную длину токенизированных строк в колонках
    `source_text` и `target_text`. Возвращает два целых числа.
    """
    source_max_len = target_max_len = -1
    for idx in range(len(dataframe)):
        source_max_len = max(len(dataframe.loc[idx, 'source_text']), source_max_len)
        target_max_len = max(len(dataframe.loc[idx, 'target_text']), target_max_len)
    return source_max_len, target_max_len

In [11]:
def generate(model, tokenizer, vectorizer, query: str, max_response_tokens:int=50, response_seq_count : int = 1, temperature: float = 1.0, device='cpu') -> str:
    """
    Генерация ответа модели на входную строку `query`.
    Возвращает тензор индексов (batch, seq_len).
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        # Токенизация и векторизация
        tokens = tokenizer.tokenize(query.lower())
        vec_dict = vectorizer.vectorize(source_tokens=tokens, use_dataset_max_len=False)

        source_vec = torch.tensor(vec_dict['source_vec'], dtype=torch.long).to(device).unsqueeze(0)
        # print(f'source_vec.size() {source_vec.size()}')
        source_len = torch.tensor([vec_dict['source_len']], dtype=torch.long).to(device)
        # print(f'source_len.size() {source_len.size()}')

        # Encoder
        encoder_state, encoder_final_hidden = model.encoder(source_vec, source_len)

        # Decoder
        decoded_states = model.decoder(encoder_state, encoder_final_hidden, forced_batch_size=response_seq_count,\
                                    sample_probability=1.0, output_sequence_size=max_response_tokens, temperature=temperature)
        
        batch_size, seq_size, vocab_size = decoded_states.size()
        decoded_states = F.softmax(decoded_states * temperature, dim=-1)
        print(f'decoded_states.size() {decoded_states.size()}')
        decoded_states = decoded_states.reshape(-1, vocab_size)
        indices = torch.multinomial(decoded_states, 1)
        print(f'indices.size() {indices.size()}')
        indices = indices.reshape(batch_size, seq_size, -1).squeeze(-1)
        return indices

In [12]:
def decode_indices(indices : torch.tensor, vectorizer):
    """
    Преобразует тензор индексов в читаемый текст.
    Останавливает чтение строки при встрече EOS‑токена.
    """
    seq_count, seq_len = (indices.size(0), indices.size(1))
    vocab = vectorizer.target_vocab
    decoded = []
    for seq in range(seq_count):
        string =''
        for idx in range(seq_len):
            index = indices[seq, idx].item()
            if index != vocab.mask_token_index:
                string += vocab.get_token(index) + ' '
            if index == vocab._eos_index:
                break
        decoded.append(string)
    return decoded

In [13]:
def save_model_to_file(model, filepath):
    """
    Сохраняет всю модель (структура + веса) в файл.
    """
    torch.save(model, filepath)

In [14]:
# train_df = pd.read_csv(os.path.join(DATASET_PATH, 'train.csv'))
# test_df = pd.read_csv(os.path.join(DATASET_PATH, 'test.csv'))
# validation_df = pd.read_csv(os.path.join(DATASET_PATH, 'validation.csv'))
# df = pd.concat([train_df, test_df, validation_df], ignore_index=True)
# del train_df, test_df, validation_df
# gc.collect()
# test_valid_len = (TEST_PROPORTION + EVAL_PROPORTION) * len(df)
# test_valid_proportion = TEST_PROPORTION / EVAL_PROPORTION
# valid_len = test_valid_len / (test_valid_proportion + 1)

In [15]:
tokenizer = SeparatorTokenizer()

In [16]:
df = pd.read_csv(os.path.join(DATASET_PATH, 'ru_en_small.csv'), index_col='Unnamed: 0')

df = df.rename(columns={'ru_text' : 'source_text', 'en_text' : 'target_text'})
df['split'] = 'train'
selected_indices = df.sample(int(EVAL_PROPORTION*len(df)), random_state=RANDOM_STATE).index
df.loc[selected_indices, 'split'] = 'validation'

# К нижнему регистру, токенизация и очистка от служебных символов
df['source_text'] = df['source_text'].apply(lambda x: tokenizer.tokenize(x.lower()))
df['target_text'] = df['target_text'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [17]:
# Поиск максимальной длины сурс и таргет текста
# source_max_len = target_max_len = -1
# for i in range(len(df)):
#     source_max_len = max(source_max_len, len(df.loc[i, 'source_text']))
#     target_max_len = max(target_max_len, len(df.loc[i, 'target_text']))
# print(source_max_len)
# print(target_max_len)

In [18]:
# source_vocab = Vocabulary()
# target_vocab = Vocabulary()

# source_freq, target_freq = get_tokens_freq(df)

# for key, value in source_freq.items():
#     if value > TOKENS_TRESHOLD_FREQ:
#         source_vocab.add_token(key)

# for key, value in target_freq.items():
#     if value > TOKENS_TRESHOLD_FREQ:
#         target_vocab.add_token(key)


# source_vocab.to_json('data/source_vocab.json')
# target_vocab.to_json('data/target_vocab.json')

In [19]:
source_vocab = Vocabulary.from_json('data/source_vocab.json')
target_vocab = Vocabulary.from_json('data/target_vocab.json')

In [20]:
vectorizer = Seq2Seq_Vectorizer(source_vocab, target_vocab, MAX_SOURCE_SEQ_LEN, MAX_TARGET_SEQ_LEN)
dataset = CustomDataset(df, vectorizer)

In [21]:
source_vocab_size = len(source_vocab)
target_vocab_size = len(target_vocab)
mask_index = target_vocab.mask_token_index

In [22]:
# model = Seq2Seq_Model(source_vocab_size=source_vocab_size, source_embedding_size=SOURCE_EMBEDDING_SIZE, target_vocab_size=target_vocab_size,\
#                       target_embedding_size=TARGET_EMBEDDING_SIZE, encoder_rnn_size=RNN_HIDDEN_SIZE, fc_hidden_size=FC_HIDDEN_SIZE, target_bos_index=target_vocab._bos_index)

In [23]:
model = torch.load(MODEL_SAVE_FILEPATH, weights_only=False)

In [24]:
model = model.to(DEVICE)

optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=LR_SCHEDULER_FACTOR, patience=LR_SCHEDULER_PATIENCE)

In [25]:
try:
    for epoch in range(EPOCHS):
        sample_probability = SAMPLE_PROBABILITY
        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_dataframe_split('train')
        batch_generator = generate_batches(dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, drop_last=DROP_LAST, device=DEVICE)
        train_running_loss = 0.0
        train_running_acc = 0.0
        epoch_err = 0.0
        model.train()
        
        for batch_index, batch_dict in enumerate(batch_generator):

            optimizer.zero_grad()

            # compute the output
            y_pred = model(batch_dict['source_vec'],
                           batch_dict['source_len'],
                           batch_dict['target_x_vec'],
                           sample_probability=sample_probability)

            # compute the loss
            loss = sequence_loss(y_pred, batch_dict['target_y_vec'], mask_index=mask_index)

            # use loss to produce gradients
            loss.backward()

            # use optimizer to take gradient step
            optimizer.step()

            # compute the running loss and running accuracy
            train_running_loss += (loss.item() - train_running_loss) / (batch_index + 1)
            epoch_err += loss.item()

            acc_t = compute_accuracy(y_pred, batch_dict['target_y_vec'], mask_index)
            train_running_acc += (acc_t - train_running_acc) / (batch_index + 1)

        print('-'*40)
        print(f'epoch {epoch}')
        print(f'train_epoch_error {epoch_err}')
        print(f'train loss {train_running_loss}   ,   train accuracy {train_running_acc}')

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_dataframe_split('validation')
        batch_generator = generate_batches(dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, drop_last=DROP_LAST, device=DEVICE)
        valid_running_loss = 0.0
        valid_running_acc = 0.0
        epoch_err = 0.0
        model.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # compute the output
            y_pred = model(batch_dict['source_vec'],
                           batch_dict['source_len'],
                           batch_dict['target_x_vec'],
                           sample_probability=sample_probability)

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['target_y_vec'], mask_index)

            # compute the running loss and accuracy
            valid_running_loss += (loss.item() - valid_running_loss) / (batch_index + 1)
            epoch_err += loss.item()

            acc_t = compute_accuracy(y_pred, batch_dict['target_y_vec'], mask_index)
            valid_running_acc += (acc_t - valid_running_acc) / (batch_index + 1)

        print(f'validation_epoch_error {epoch_err}')
        print(f'validation loss {valid_running_loss}   ,   validation accuracy {valid_running_acc}')
        
except KeyboardInterrupt:
    print("Exiting loop")

----------------------------------------
epoch 0
train_epoch_error 495.5587911605835
train loss 0.8740013953449445   ,   train accuracy 79.71496327455522
validation_epoch_error 147.41282963752747
validation loss 1.0454810612590595   ,   validation accuracy 77.77468260208857
----------------------------------------
epoch 1
train_epoch_error 490.12117552757263
train loss 0.8644112443167068   ,   train accuracy 79.89346674594118
validation_epoch_error 148.01848256587982
validation loss 1.0497764720984382   ,   validation accuracy 77.74995858589068
----------------------------------------
epoch 2
train_epoch_error 487.2110313177109
train loss 0.859278714846051   ,   train accuracy 80.04840063585054
validation_epoch_error 148.83492928743362
validation loss 1.0555668743789615   ,   validation accuracy 77.71252186905298
----------------------------------------
epoch 3
train_epoch_error 484.12215983867645
train loss 0.8538309697331153   ,   train accuracy 80.11523998376522
validation_epoch_err

In [30]:
query = 'Я собираюсь поехать в отпуск'

In [31]:
indices = generate(model, tokenizer, vectorizer, query, max_response_tokens=MAX_GENERATED_SEQ_LEN, response_seq_count=1, device=DEVICE)
response = decode_indices(indices, vectorizer)

decoded_states.size() torch.Size([1, 114, 4882])
indices.size() torch.Size([114, 1])


In [32]:
response

["i ' m going to go in vacation . <EOS> "]

In [29]:
save_model_to_file(model, MODEL_SAVE_FILEPATH)