# 3. Решить задачу машинного перевода
Задание:
- Формируем датасет с исходного языка на целевой (код прописать в классе)
- Строим архитектуру нейронной сети 
- Обучаем 
- Проверить качество с помощью метрики BLEU

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import sacrebleu
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

Загружаем данные (данные решил взять с сайта https://www.manythings.org/anki/, надеюсь это был законный мув :) )

In [2]:
def load_dataset(path, num_samples=10000):
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    pairs = [line.strip().split("\t")[:2] for line in lines[:num_samples]]
    return pairs

dataset = load_dataset("rus-eng.txt", num_samples=10000)

Токенизация 

In [3]:
def tokenize(text):
    return text.lower().split()

Создаем словари

In [4]:
class Vocabulary:
    def __init__(self):
        self.word2index = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.index2word = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.freq = {}

    def add_sentence(self, sentence):
        for word in sentence:
            if word not in self.freq:
                self.freq[word] = 1
            else:
                self.freq[word] += 1

    def build_vocab(self, min_freq=2):
        idx = 4
        for word, freq in self.freq.items():
            if freq >= min_freq:
                self.word2index[word] = idx
                self.index2word[idx] = word
                idx += 1

    def numericalize(self, sentence):
        return [self.word2index.get(word, self.word2index["<unk>"]) for word in sentence]

ru_vocab = Vocabulary()
en_vocab = Vocabulary()

for ru, en in dataset:
    ru_vocab.add_sentence(tokenize(ru))
    en_vocab.add_sentence(tokenize(en))

ru_vocab.build_vocab()
en_vocab.build_vocab()

Создаем датасет

In [5]:
class TranslationDataset(Dataset):
    def __init__(self, data, src_vocab, trg_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, trg = self.data[idx]
        src_tokens = self.src_vocab.numericalize(tokenize(src)) + [self.src_vocab.word2index["<eos>"]]
        trg_tokens = [self.trg_vocab.word2index["<sos>"]] + self.trg_vocab.numericalize(tokenize(trg)) + [self.trg_vocab.word2index["<eos>"]]
        return torch.tensor(src_tokens), torch.tensor(trg_tokens)

dataset = TranslationDataset(dataset, ru_vocab, en_vocab)

collate fn для динамических данных

In [6]:
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=ru_vocab.word2index["<pad>"], batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=en_vocab.word2index["<pad>"], batch_first=True)
    return src_batch, trg_batch

train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

Модель

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        hidden, cell = self.encoder(src)
        batch_size, trg_len = trg.shape
        outputs = torch.zeros(batch_size, trg_len, len(en_vocab.word2index))

        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = output
            best_guess = output.argmax(1)
            input = trg[:, t] if random.random() < teacher_forcing_ratio else best_guess

        return outputs

Обучение

In [8]:
input_dim = len(ru_vocab.word2index)
output_dim = len(en_vocab.word2index)
emb_dim = 256
hidden_dim = 512
num_layers = 2

encoder = Encoder(input_dim, emb_dim, hidden_dim, num_layers)
decoder = Decoder(output_dim, emb_dim, hidden_dim, num_layers)
model = Seq2Seq(encoder, decoder)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=en_vocab.word2index["<pad>"])

def train(model, loader, optimizer, criterion, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for src, trg in loader:
            optimizer.zero_grad()
            output = model(src, trg)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

train(model, train_loader, optimizer, criterion, num_epochs=10)

Epoch 1, Loss: 1190.5804
Epoch 2, Loss: 895.7237
Epoch 3, Loss: 729.8688
Epoch 4, Loss: 594.3665
Epoch 5, Loss: 484.0489
Epoch 6, Loss: 396.5794
Epoch 7, Loss: 326.8826
Epoch 8, Loss: 279.7658
Epoch 9, Loss: 248.8953
Epoch 10, Loss: 231.3638


BLEU

In [None]:
def evaluate_bleu(model, loader):
    model.eval()
    actual, predicted = [], []

    with torch.no_grad():
        for src, trg in loader:
            hidden, cell = model.encoder(src)
            batch_size = src.shape[0]
            input = torch.tensor([en_vocab.word2index["<sos>"]]).repeat(batch_size).to(src.device)

            translated_sentences = [[] for _ in range(batch_size)]

            for _ in range(50): 
                output, hidden, cell = model.decoder(input, hidden, cell)
                best_guess = output.argmax(1)  
                
                for i in range(batch_size):
                    translated_sentences[i].append(best_guess[i].item())  
                
                input = best_guess  

            
            for i in range(batch_size):
                actual_sentence = " ".join([en_vocab.index2word[idx.item()] for idx in trg[i] if idx.item() not in {en_vocab.word2index["<pad>"], en_vocab.word2index["<sos>"], en_vocab.word2index["<eos>"]}])
                predicted_sentence = " ".join([en_vocab.index2word[idx] for idx in translated_sentences[i] if idx not in {en_vocab.word2index["<pad>"], en_vocab.word2index["<sos>"], en_vocab.word2index["<eos>"]}])
                
                actual.append([actual_sentence])  
                predicted.append(predicted_sentence)

    bleu_score = sacrebleu.corpus_bleu(predicted, actual)
    print(f"BLEU Score: {bleu_score.score:.2f}")

evaluate_bleu(model, train_loader)


BLEU Score: 100.00


## Выводы:

Loss равномерно падал на все процессе обучения, а BLEU показал 100%... Всё получилось :)