In [None]:
#Переписать загрузку данных с python функций на Dataset и Dataloader и применить сеть с attention

In [127]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from string import punctuation
import torch.nn.functional as F

In [141]:
#batch_size = 64
epochs = 30
latent_dim = 256
num_samples = 10000
ex = set(punctuation)

In [142]:
with open('data/rus-eng/rus.txt',mode='r', encoding='utf-8') as file:
    lines = file.read().split('\n')
len(lines)        
    

392306

In [143]:
input_texts = []
target_texts = []
input_vocab = set()
output_vocab = set()

for line in lines[:num_samples]:
    input_txt, target_txt, __ = line.split('\t')
    target_txt = '\t' + target_txt + '\n'
    input_texts.append(input_txt)
    target_texts.append(target_txt)
    for word in input_txt.split():
#         word = str(word).lower()
#         word = ''.join(i for i in lines if i not in ex)
        input_vocab.add(word.strip())
    for word in target_txt.split():
#         word = str(word).lower()
#         word = ''.join(i for i in lines if i not in ex)
        output_vocab.add(word.strip())
   

In [144]:
input_index = {word: i + 2 for i, word in enumerate(input_vocab)}
output_index = {word: i + 2 for i, word in enumerate(output_vocab)}

In [145]:
class EnRus(Dataset):
    def __init__(self, input_text, input_idx, target_text, target_idx):
        self.input_text = input_text
        self.input_idx = input_idx
        self.target_text = target_text
        self.target_idx = target_idx
        
     
    @classmethod
    def get_index(self, sentence, vocab, idx):
        return [vocab.get(word, 0) for word in sentence[idx].split(' ')]
    
    @classmethod
    def add_one(self, sentence, vocab, idx):
        indexes = self.get_index(sentence, vocab, idx)
        indexes.append(1)
        return torch.tensor(indexes, dtype=torch.long).view(-1, 1)
    
    def final_tensor(self, idx):
        input_tensor = self.add_one(self.input_text, self.input_idx, idx)
        output_tensor = self.add_one(self.target_text, self.target_idx, idx)
        return (input_tensor, output_tensor)
    
    def __len__(self):
        return len(self.input_text)

    
    def __getitem__(self, idx):
        pair = self.final_tensor(idx)
        return pair
     

In [176]:
data = EnRus(input_text=input_texts, input_idx=input_index, target_text=target_texts, target_idx=output_index)

In [177]:
final_data = DataLoader(data, batch_size=30, shuffle=True, drop_last=True)

In [178]:
class DecoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, input_size)
        
        
    def forward(self, input_, hidden, encoder_outputs):
        embedded = self.embedding(input_).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden.view(1, 1, 1, -1))
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output

In [179]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        #output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [180]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[0]])

    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input

        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == 1:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [140]:
# Здесь обработка только через Dataset, ниже - попытка в DataLoader
# Качество особо не улучшилось
encoder = EncoderRNN(len(input_index)+2, 30)
attn_decoder1 = AttnDecoderRNN(30, len(output_index)+2, dropout_p=0.1)

#attn_decoder1 = DecoderRNN(len(output_vocab2index)+2, 30)

encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.SGD(attn_decoder1.parameters(), lr=0.01)
training_pairs = np.random.randint(0, len(input_texts), size=100000)
criterion = nn.NLLLoss()

print_loss_total = 0
for i in range(100000):    
        input_tensor, target_tensor = data[training_pairs[i]]

        loss = train(input_tensor, target_tensor, encoder,
               attn_decoder1, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
    
        print_loss_avg = print_loss_total / 1
        print_loss_total = 0
        if i != 0 and i % 1000 == 0:
            print('(%d %d%%) %.4f' % (i, i / 10 * 100, print_loss_avg))
            #print(f'Epoch {i} Loss {print_loss_avg}')



(1000 10000%) 2.8416
(2000 20000%) 4.5871
(3000 30000%) 2.5645
(4000 40000%) 3.0038
(5000 50000%) 1.6322
(6000 60000%) 1.5926
(7000 70000%) 2.8010
(8000 80000%) 4.1900
(9000 90000%) 2.6429
(10000 100000%) 0.2757
(11000 110000%) 3.1192
(12000 120000%) 3.3673
(13000 130000%) 1.4466
(14000 140000%) 3.3976
(15000 150000%) 3.2623
(16000 160000%) 0.3130
(17000 170000%) 2.1639
(18000 180000%) 2.3759
(19000 190000%) 2.4264
(20000 200000%) 1.6752
(21000 210000%) 1.5989
(22000 220000%) 5.1185
(23000 230000%) 2.0581
(24000 240000%) 1.2077
(25000 250000%) 4.0137
(26000 260000%) 2.0311
(27000 270000%) 3.2472
(28000 280000%) 2.1888
(29000 290000%) 5.1681
(30000 300000%) 3.0166
(31000 310000%) 3.0915
(32000 320000%) 1.3094
(33000 330000%) 3.5077
(34000 340000%) 3.9526
(35000 350000%) 3.1285
(36000 360000%) 0.1574
(37000 370000%) 2.0406
(38000 380000%) 1.7910
(39000 390000%) 0.3538
(40000 400000%) 0.7997
(41000 410000%) 2.8695
(42000 420000%) 4.0073
(43000 430000%) 3.0234
(44000 440000%) 3.2112
(45000

In [183]:
# Моя любимая ошибка размерности, которую я не одолела
# Хочу увидеть решение!!
encoder = EncoderRNN(len(input_index)+2, 30)
attn_decoder1 = AttnDecoderRNN(30, len(output_index)+2, dropout_p=0.1)

#attn_decoder1 = DecoderRNN(len(output_vocab2index)+2, 30)

encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.SGD(attn_decoder1.parameters(), lr=0.01)
training_pairs = np.random.randint(0, len(input_texts), size=10000)
criterion = nn.NLLLoss()

print_loss_total = 0
for i in range(10000): 
    for index, (x, y) in enumerate(final_data):
        input_tensor, target_tensor = x, y
  
        loss = train(input_tensor,
                     target_tensor, 
                     encoder,
               attn_decoder1, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
    
        print_loss_avg = print_loss_total / 1
        print_loss_total = 0
        if i != 0 and i % 1000 == 0:
            print('(%d %d%%) %.4f' % (i, i / 10 * 100, print_loss_avg))
            #print(f'Epoch {i} Loss {print_loss_avg}')

RuntimeError: stack expects each tensor to be equal size, but got [4, 1] at entry 0 and [3, 1] at entry 1