In [1]:
%matplotlib inline

from io import open
import unicodedata
import string
import re
import random
import zipfile
# import tkinter

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare Data

In [2]:
with zipfile.ZipFile('rus-eng.zip') as myzip:
    myzip.extractall()

In [3]:
with open('rus.txt', encoding='utf-8') as file:
    lines = file.readlines()
    print(*random.sample(lines, 10))

She's promised to give me a ring.	Она обещала подарить мне кольцо.	CC-BY 2.0 (France) Attribution: tatoeba.org #568985 (CM) & #3175817 (Selena777)
 We should try to understand one another.	Мы должны попытаться понять друг друга.	CC-BY 2.0 (France) Attribution: tatoeba.org #247923 (CK) & #1716185 (marafon)
 You need to do it now.	Вам нужно сделать это сейчас.	CC-BY 2.0 (France) Attribution: tatoeba.org #9302089 (CK) & #9099761 (marafon)
 Put the book on the top shelf.	Положи книгу на верхнюю полку.	CC-BY 2.0 (France) Attribution: tatoeba.org #43934 (CK) & #3517979 (odexed)
 I've been waiting for Tom since 2:30.	Я жду Тома с половины третьего.	CC-BY 2.0 (France) Attribution: tatoeba.org #2359323 (CK) & #5969383 (sharptoothed)
 You're really dirty.	Ты очень грязный.	CC-BY 2.0 (France) Attribution: tatoeba.org #6099168 (CK) & #8005105 (marafon)
 You're screaming.	Вы кричите.	CC-BY 2.0 (France) Attribution: tatoeba.org #2203794 (CM) & #5684435 (marafon)
 We're still not allowed to enter.	На

In [4]:
with open('eng-rus.txt', 'w', encoding='utf-8') as file:
    for line in lines:
        stripped = line.split('\t')
        file.write(stripped[0] + '\t' + stripped[1] + '\n')

In [5]:
with open('eng-rus.txt', encoding='utf-8') as file:
    lines = file.readlines()
    print(*random.sample(lines, 10))

This is an ax.	Это топор.
 Some politicians are wolves in sheep's clothing.	Некоторые политики — это волки в овечьих шкурах.
 She disappeared in the dark.	Она исчезла в темноте.
 You don't look like you're sleepy.	Не похоже, чтобы вы хотели спать.
 She's trying on a coat.	Она примеряет пальто.
 We know what you're trying to do.	Мы знаем, что ты пытаешься сделать.
 He brought me the news that our team had won.	Он принёс мне известие, что наша команда одержала победу.
 Don't expect Tom to agree to do that.	Не ждите, что Том согласится это делать.
 You look awful.	Выглядите ужасно.
 Pick your toys up.	Собери игрушки.



In [6]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [7]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z-А-аЯ-я.!?]+", r" ", s)
    return s

In [8]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [9]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [10]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Reading lines...
Read 479223 sentence pairs
Trimmed to 27844 sentence pairs
Counting words...
Counted words:
rus 10125
eng 4320
['вы живете прошлым .', 'you re living in the past .']


# Define train function

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [12]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [13]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [15]:

import matplotlib.pyplot as plt

plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt

In [16]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [17]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

# GRU 1 layer

In [18]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [19]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [20]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder1, decoder1, 75000, print_every=5000)

1m 6s (- 15m 29s) (5000 6%) 3.1277
2m 5s (- 13m 38s) (10000 13%) 2.6288
3m 25s (- 13m 42s) (15000 20%) 2.3579
4m 30s (- 12m 24s) (20000 26%) 2.1682
5m 52s (- 11m 44s) (25000 33%) 1.9784
8m 46s (- 13m 10s) (30000 40%) 1.8446
10m 48s (- 12m 21s) (35000 46%) 1.7308
15m 57s (- 13m 57s) (40000 53%) 1.6196
21m 59s (- 14m 39s) (45000 60%) 1.5346
27m 30s (- 13m 45s) (50000 66%) 1.4895
29m 25s (- 10m 42s) (55000 73%) 1.3959
31m 28s (- 7m 52s) (60000 80%) 1.3220
33m 30s (- 5m 9s) (65000 86%) 1.2973
35m 29s (- 2m 32s) (70000 93%) 1.2075
37m 29s (- 0m 0s) (75000 100%) 1.1804


In [21]:
evaluateRandomly(encoder1, decoder1)

> они быстро двигаются .
= they re moving fast .
< they re in a . . <EOS>

> мы пришли тебя поддержать .
= we re here to support you .
< we re here to support you . <EOS>

> я очень доволен своим новым домом .
= i m very pleased with my new house .
< i m very pleased with my new . <EOS>

> вам в эту комнату нельзя .
= you aren t allowed in this room .
< you re not allowed to leave this room . <EOS>

> мы тебя защитим .
= we re going to protect you .
< we re going to find you . <EOS>

> меня почти никогда не бывает дома .
= i m almost never home .
< i m almost never home . <EOS>

> боюсь будет дождь .
= i m afraid it will rain .
< i m afraid it will rain rain . <EOS>

> вам ведь лучше ?
= you re feeling better aren t you ?
< you re feeling better aren t you ? <EOS>

> они на минут отстают от графика .
= they re minutes behind schedule .
< they re running minutes behind schedule . <EOS>

> я глухои .
= i m deaf .
< i m a . <EOS>



Полученный перевод явно уловил некоторые верноые закономерности и способен выдать правильный ответ на простое предложение, но также зачастую не является верным. Наиболее часто встречаются проблемы с переводом последнего слова

# GRU 2 layer

In [22]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(2, 1, self.hidden_size, device=device)

In [23]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(2, 1, self.hidden_size, device=device)

In [24]:
hidden_size = 256
encoder2 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder2 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder2, decoder2, 75000, print_every=5000)

2m 54s (- 40m 36s) (5000 6%) 3.1445
5m 23s (- 35m 2s) (10000 13%) 2.6968
8m 2s (- 32m 11s) (15000 20%) 2.4536
10m 20s (- 28m 25s) (20000 26%) 2.2478
13m 0s (- 26m 0s) (25000 33%) 2.0699
15m 31s (- 23m 16s) (30000 40%) 1.9201
17m 57s (- 20m 31s) (35000 46%) 1.8047
20m 32s (- 17m 58s) (40000 53%) 1.6864
23m 7s (- 15m 25s) (45000 60%) 1.6024
25m 30s (- 12m 45s) (50000 66%) 1.5047
27m 59s (- 10m 10s) (55000 73%) 1.4447
30m 31s (- 7m 37s) (60000 80%) 1.3737
32m 57s (- 5m 4s) (65000 86%) 1.3040
35m 15s (- 2m 31s) (70000 93%) 1.2499
37m 17s (- 0m 0s) (75000 100%) 1.2089


In [25]:
evaluateRandomly(encoder2, decoder2)

> она всего лишь ребенок .
= she s only a child .
< she s just a child . <EOS>

> тебе повезло что тебя не пристрелили .
= you re lucky you didn t get shot .
< you re lucky to be late . <EOS>

> мы оба очень заняты .
= we re both very busy .
< we re both very busy . <EOS>

> я ничего не боюсь .
= i m not frightened of anything .
< i m not afraid of anything . <EOS>

> он ботаник .
= he s a nerd .
< he s a beginner . <EOS>

> они приступают к выполнению домашнего задания .
= they are beginning their homework .
< they re making their on the way . <EOS>

> я иду .
= i m coming .
< i m coming . <EOS>

> он большои и сильныи .
= he s big and strong .
< he s a very strong . <EOS>

> ты добросовестныи .
= you re conscientious .
< you re a . <EOS>

> я уверен что том прав .
= i m sure tom is right .
< i m sure tom is . <EOS>



Хотя Loss в данном случае даже незначительно выше чем в случае с 1 слоем, перевод кажется даже более верным. Во многих случаях нейросеть использовала близкие по значению слова frightened - afraid, only - just и т.п. Также встречается некотороя проблема с переводом слов ближе к концу предложения. Судя по данному примеру, вариант с 2 уровнями в конце концов стал бы лучшим, если бы мы захотели расширить обучающую выборку и увеличить время обучения

# LSTM 2 layers

In [26]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=2)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(2, 1, self.hidden_size, device=device), torch.zeros(2, 1, self.hidden_size, device=device))

In [27]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return (torch.zeros(2, 1, self.hidden_size, device=device), torch.zeros(2, 1, self.hidden_size, device=device))

In [28]:
hidden_size = 256
encoder3 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder3 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder3, decoder3, 75000, print_every=5000)

2m 33s (- 35m 51s) (5000 6%) 3.3465
5m 6s (- 33m 14s) (10000 13%) 2.8510
7m 33s (- 30m 13s) (15000 20%) 2.6657
10m 0s (- 27m 31s) (20000 26%) 2.5466
12m 27s (- 24m 55s) (25000 33%) 2.4033
14m 40s (- 22m 0s) (30000 40%) 2.2665
15m 57s (- 18m 14s) (35000 46%) 2.1525
17m 13s (- 15m 4s) (40000 53%) 2.0654
19m 7s (- 12m 44s) (45000 60%) 1.9323
20m 22s (- 10m 11s) (50000 66%) 1.8848
21m 39s (- 7m 52s) (55000 73%) 1.7928
22m 54s (- 5m 43s) (60000 80%) 1.7223
24m 10s (- 3m 43s) (65000 86%) 1.6378
25m 24s (- 1m 48s) (70000 93%) 1.5477
26m 39s (- 0m 0s) (75000 100%) 1.4955


In [29]:
evaluateRandomly(encoder3, decoder3)

> я доделываю уроки .
= i m finishing my homework .
< i m a my . . <EOS>

> вы не всегда правы .
= you re not always right .
< you re always wrong . <EOS>

> нам надо будет это проверить .
= we re going to have to verify this .
< we re going to have to to . . <EOS>

> ты стареешь .
= you re getting old .
< you re the . <EOS>

> я очень рад что том здесь .
= i m happy that tom is here .
< i m really happy happy tom here . <EOS>

> ты не слушаешь .
= you aren t listening .
< you re not going . <EOS>

> я уверен что том может это уладить .
= i m confident tom can fix it .
< i m sure tom tom do that . <EOS>

> мы здесь ради тома .
= we re here for tom .
< we re here for tom . <EOS>

> я уверен что вы вернетесь .
= i m sure you ll be back .
< i m sure you ll ll . <EOS>

> ты постоянно ко мне цепляешься .
= you re always finding fault with me .
< you re always criticizing to me you . <EOS>



В случая применения 2 уровневой LSTM ячейки мы опять же видим применение схожих смысловых конструкций, но в то же время здесь же наиболее видна проблема перевода концов предложений. Это же наблюдение подтверждаетися наибольшим показателем Loss`а. Похоже, что данная модель не очень хорошо справляется с длительным сохранением контекста.