# Переводчик с английского на русский

### 1. Переписать загрузку данных с Python-функций на Dataset и DataLoader и применить сеть с attention

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Location of the parallel-corpus text file that is handed to the dataset class.
# NOTE(review): the path is machine-specific (it looks like a mounted Google Drive
# folder) -- adjust it when running in another environment.
data_path_ru = '/content/drive/Othercomputers/Мое устройство Компьютер/Google.Disk/Colab Notebooks/data/rus-eng/rus.txt'

In [3]:
class LipsDataset(torch.utils.data.Dataset):
    """Word-level parallel corpus read from a tab-separated text file.

    Every usable line holds a source sentence and a target sentence separated
    by a tab; any further tab-separated fields are ignored.  Blank lines and
    lines without a tab are skipped.  Words are whitespace-separated tokens.

    Index 0 stands for a word that is missing from a vocabulary and index 1 is
    appended to every encoded sentence as the end-of-sentence marker, so real
    words are numbered from 2 upwards.

    Attributes:
        input_texts: source sentences, in file order.
        target_texts: target sentences wrapped as ``'\\t' + text + '\\n'``.
        input_vocab2index: source word -> index mapping.
        output_vocab2index: target word -> index mapping.
    """

    UNKNOWN_INDEX = 0
    EOS_INDEX = 1

    def __init__(self, data_path, num_samples = 300):
        """Load at most ``num_samples`` sentence pairs from ``data_path``.

        Args:
            data_path: path to a UTF-8 text file with tab-separated pairs.
            num_samples: upper bound on the number of pairs to keep.
        """
        self.input_texts = []
        self.target_texts = []

        input_vocab = set()
        output_vocab = set()

        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if len(self.input_texts) >= num_samples:
                    break
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to pair up.
                    continue
                input_text, target_text = fields[0], fields[1]
                # The tab/newline wrapper is pure whitespace, so it disappears
                # again when the sentence is tokenized.
                target_text = '\t' + target_text + '\n'
                self.input_texts.append(input_text)
                self.target_texts.append(target_text)
                input_vocab.update(input_text.split())
                output_vocab.update(target_text.split())

        # Sorting makes the word -> index mapping reproducible between runs
        # (plain set iteration order depends on string hashing).
        self.input_vocab2index = {
            word: i + 2 for i, word in enumerate(sorted(input_vocab))
        }
        self.output_vocab2index = {
            word: i + 2 for i, word in enumerate(sorted(output_vocab))
        }

    def __getitem__(self, index):
        """Return the ``(input_tensor, target_tensor)`` pair at ``index``."""
        return self.tensorsFromSent(self.input_texts[index], self.target_texts[index])

    def indexesFromSentence(self, sentence, vocab):
        """Map each word of ``sentence`` to its index in ``vocab`` (0 if absent).

        The sentence is tokenized with ``str.split()``, the same way the
        vocabularies are built, so every word seen at load time is found.
        """
        return [vocab.get(word, self.UNKNOWN_INDEX) for word in sentence.split()]

    def tensorFromSentence(self, sentence, vocab):
        """Encode ``sentence`` as a ``(length + 1, 1)`` long tensor ending in index 1."""
        indexes = self.indexesFromSentence(sentence, vocab)
        indexes.append(self.EOS_INDEX)
        return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

    def tensorsFromSent(self, input_sentences, output_sentences):
        """Encode a source/target sentence pair with the matching vocabularies."""
        input_tensor = self.tensorFromSentence(input_sentences, self.input_vocab2index)
        target_tensor = self.tensorFromSentence(output_sentences, self.output_vocab2index)
        return (input_tensor, target_tensor)

    def __len__(self):
        """Number of loaded sentence pairs."""
        return len(self.input_texts)

    def len_target(self):
        """Number of distinct target words (reserved indices 0 and 1 excluded)."""
        return len(self.output_vocab2index)

    def len_input(self):
        """Number of distinct source words (reserved indices 0 and 1 excluded)."""
        return len(self.input_vocab2index)

In [4]:
# Build the dataset (sentence pairs and word vocabularies) from the corpus file,
# keeping at most `num_samples` sentence pairs.
num_samples = 300
dataset = LipsDataset(data_path_ru, num_samples)

In [5]:
class EncoderRNN(nn.Module):
    """GRU encoder that processes a sentence one token index per call.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: width of both the embedding vectors and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Embeddings are as wide as the GRU state, so they feed the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Advance the GRU by one step.

        The embedded token is reshaped to ``(1, 1, hidden_size)`` before it is
        passed to the GRU.  Returns the GRU's ``(output, hidden)`` pair.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        step_output, next_hidden = self.gru(step_input, hidden)
        return step_output, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros((1, 1, self.hidden_size))


class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the embeddings and of the GRU state.
        output_size: size of the target vocabulary (rows of the embedding
            table and width of the output layer).
        dropout_p: dropout probability applied to the embedded input token.
        max_length: number of encoder positions the attention layer scores;
            ``encoder_outputs`` passed to ``forward`` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one weight per encoder position from [embedding ; hidden].
        self.attn = nn.Linear(2 * hidden_size, max_length)
        # Squeezes [embedding ; attention context] back to the GRU input width.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Returns a tuple ``(log_probs, hidden, attn_weights)``: the log-softmax
        over the output vocabulary, the updated GRU state and the softmax
        attention weights used for this step.
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention depends on the current token and the previous hidden state.
        attn_scores = self.attn(torch.cat((token_vec[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        merged = torch.cat((token_vec[0], context[0]), 1)
        # No non-linearity is applied between the combine layer and the GRU.
        gru_input = self.attn_combine(merged).unsqueeze(0)
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros((1, 1, self.hidden_size))

In [6]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10):
    """Run one optimisation step on a single sentence pair.

    The whole input is fed through the encoder token by token.  The decoder
    then starts from token index 0 and is fed its own greedy prediction at
    every following step (no teacher forcing).  Decoding stops after
    ``target_length`` steps or as soon as the predicted token is index 1
    (end of sentence), whichever comes first.

    Args:
        input_tensor: source token indices, one token per row.
        target_tensor: target token indices, one token per row.
        encoder: module called as ``encoder(token, hidden)`` that also offers
            ``initHidden()`` and ``hidden_size``.
        decoder: module called as ``decoder(token, hidden, encoder_outputs)``.
        encoder_optimizer: optimizer stepping the encoder parameters.
        decoder_optimizer: optimizer stepping the decoder parameters.
        criterion: loss applied to ``(decoder_output, target_tensor[di])``.
        max_length: number of encoder positions kept for the decoder;
            encoder outputs beyond it are not stored.

    Returns:
        The summed loss divided by the target length, as a Python float
        (``0.0`` for an empty target, in which case no update is made).
    """
    sos_index, eos_index = 0, 1

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    if target_length == 0:
        # Nothing to predict: there is no loss to back-propagate.
        return 0.0

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        # The buffer has only max_length rows; longer inputs still update the
        # hidden state but their extra outputs cannot be stored.
        if ei < max_length:
            encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[sos_index]])

    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input

        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == eos_index:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Normalised by the full target length even when decoding stopped early.
    return loss.item() / target_length

In [7]:
# Vocabulary indices start at 2 (0 and 1 are reserved for the unknown/start
# token and the end-of-sentence token), hence the "+2" on both table sizes.
encoder = EncoderRNN(dataset.len_input()+2, 30)
attn_decoder1 = AttnDecoderRNN(30, dataset.len_target()+2, dropout_p=0.1)

encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.SGD(attn_decoder1.parameters(), lr=0.01)

criterion = nn.NLLLoss()

n_epochs = 200000  # number of training steps (one sentence pair per step)
n = max(1, n_epochs // 10)  # reporting interval; never 0, so `epoch % n` is safe
np.random.seed(1000)
# Sample from the pairs that were actually loaded: the file may contain fewer
# than `num_samples` lines, and indexing past the end would raise IndexError.
training_pairs = np.random.randint(0, len(dataset), size=n_epochs)
loss_accumed = 0
steps_accumed = 0  # steps summed into loss_accumed since the last report

for epoch in tqdm(range(n_epochs)):

    input_tensor, target_tensor = dataset[training_pairs[epoch]]
    loss = train(input_tensor, target_tensor, encoder, attn_decoder1, encoder_optimizer, decoder_optimizer, criterion)
    loss_accumed += loss
    steps_accumed += 1

    if epoch < 1:
        # Report the loss of the very first step on its own.
        print(f'\nepoch = {epoch}    loss = {loss_accumed:.3f}')
        loss_accumed = 0
        steps_accumed = 0
        continue

    if epoch % n == 0:
        print(f'\nepoch = {epoch}    loss = {loss_accumed/steps_accumed:.3f}')
        loss_accumed = 0
        steps_accumed = 0

# Average over the steps really accumulated since the last report (one fewer
# than `n`, because the loop ends at n_epochs - 1).
if steps_accumed:
    print(f'\nepoch = {n_epochs}    loss = {loss_accumed/steps_accumed:.3f}')

  0%|          | 22/200000 [00:00<38:19, 86.97it/s]  


epoch = 0    loss = 5.855


 10%|█         | 20047/200000 [01:25<11:27, 261.71it/s]


epoch = 20000    loss = 2.143


 20%|██        | 40033/200000 [02:43<10:05, 264.10it/s]


epoch = 40000    loss = 0.931


 30%|███       | 60041/200000 [04:02<09:03, 257.69it/s]


epoch = 60000    loss = 0.721


 40%|████      | 80025/200000 [05:21<07:59, 250.35it/s]


epoch = 80000    loss = 0.675


 50%|█████     | 100029/200000 [06:40<06:43, 247.75it/s]


epoch = 100000    loss = 0.655


 60%|██████    | 120030/200000 [07:59<05:08, 258.85it/s]


epoch = 120000    loss = 0.643


 70%|███████   | 140030/200000 [09:18<03:58, 251.33it/s]


epoch = 140000    loss = 0.640


 80%|████████  | 160025/200000 [10:36<02:33, 260.86it/s]


epoch = 160000    loss = 0.635


 90%|█████████ | 180050/200000 [11:54<01:16, 259.68it/s]


epoch = 180000    loss = 0.635


100%|██████████| 200000/200000 [13:12<00:00, 252.47it/s]


epoch = 200000    loss = 0.631





### Проверим результат:

In [11]:
# Two-column lookup tables (word, token) for the target and source vocabularies,
# used to turn token indices back into words.
output_data, input_data = (
    pd.DataFrame(list(vocab.items()), columns=['word', 'token'])
    for vocab in (dataset.output_vocab2index, dataset.input_vocab2index)
)

In [12]:
def reply(input_tensor, encoder, decoder, max_length=10):
    """Encode a sentence and return the decoder output for the first step.

    Inference runs with both modules temporarily switched to evaluation mode
    (so dropout is inactive and the result is deterministic) and with
    gradient tracking disabled.  The modules' previous training/eval mode is
    restored before returning.

    Args:
        input_tensor: source token indices, one token per row.
        encoder: module called as ``encoder(token, hidden)`` that also offers
            ``initHidden()`` and ``hidden_size``.
        decoder: module called as ``decoder(token, hidden, encoder_outputs)``.
        max_length: number of encoder positions kept for the decoder;
            encoder outputs beyond it are not stored.

    Returns:
        The first element of the decoder's result for the start token
        (index 0), i.e. its output for the first target position.
    """
    sos_index = 0

    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            encoder_hidden = encoder.initHidden()
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

            for ei in range(input_tensor.size(0)):
                encoder_output, encoder_hidden = encoder(
                    input_tensor[ei], encoder_hidden)
                # The buffer has only max_length rows; extra outputs of a
                # longer input cannot be stored.
                if ei < max_length:
                    encoder_outputs[ei] = encoder_output[0, 0]

            decoder_input = torch.tensor([[sos_index]])
            decoder_hidden = encoder_hidden

            decoder_output, _, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
    finally:
        # Leave the modules in whatever mode the caller had them in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return decoder_output

In [19]:
def _token_to_word(vocab_table, token):
    """Return the word stored for ``token`` in a (word, token) lookup table.

    Indices 0 and 1 are reserved and have no row in the table, so they are
    rendered as placeholders instead of raising on an empty lookup.
    """
    matches = vocab_table.loc[vocab_table['token'] == token, 'word']
    if len(matches):
        return matches.iloc[0]
    return {0: '<unk>', 1: '<eos>'}.get(token, f'<{token}>')


# Show the first source word, the predicted first target word and the expected
# first target word for a few randomly chosen pairs.
n_examples = 10
# Sample only from pairs that were actually loaded, and only as many as shown.
training_pairs = np.random.randint(0, len(dataset), size=n_examples)
for pair_index in training_pairs:
    input_tensor, target_tensor = dataset[pair_index]
    output = reply(input_tensor, encoder, attn_decoder1)
    _, topi = output.topk(1)
    print(f"input = {_token_to_word(input_data, input_tensor[0][0].item())}")
    print(f"output = {_token_to_word(output_data, topi.squeeze().detach().item())}")
    print(f"target = {_token_to_word(output_data, target_tensor[0][0].item())}\n")

input = I
output = Понятно.
target = Вижу.

input = I'm
output = Я
target = Я

input = Come
output = Зайдите!
target = Заходи.

input = Smile.
output = Улыбайтесь!
target = Улыбнись.

input = Fold
output = Сложи
target = Сложите

input = Hit
output = Бейте
target = Ударь

input = Eat
output = Доедай.
target = Доедай.

input = Jump.
output = Прыгайте!
target = Прыгай!

input = He
output = Он
target = Он

input = Hold
output = Замри!
target = Замри!

