In [21]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
import pandas as pd

import os
import re
import random
import time
import datetime
import math

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [22]:
# Indices reserved for the special tokens; real words start right after them.
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Lang:
    """Vocabulary of one language: word <-> index maps plus word frequencies.

    Attributes:
        name: label of the language (e.g. 'rus').
        trimmed: True once ``trim`` has run; later ``trim`` calls are no-ops.
        word2index: maps a word to its integer index.
        word2count: maps a word to the number of times it was added.
        index2word: maps an index back to its word (includes PAD/SOS/EOS).
        n_words: size of the vocabulary including the three special tokens;
            also the next free index.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._resetVocabulary()

    def _resetVocabulary(self):
        """Drop every regular word, keeping only the PAD/SOS/EOS entries."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}

        # Three slots are already taken by the special tokens.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        """Remove words seen fewer than ``min_count`` times and re-index the rest.

        Only the first call has an effect. Surviving words keep their relative
        order and their original counts; indices are reassigned contiguously
        starting right after the special tokens.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total = len(self.word2index)
        # Guard the ratio: an empty vocabulary would otherwise divide by zero.
        ratio = len(keep_words) / total if total else 0.0
        print('keep_words %s / %s = %.4f' % (len(keep_words), total, ratio))

        # Without this reset addWord() finds every word already present, so
        # nothing would ever be removed and the counts would just be inflated.
        kept_counts = {word: self.word2count[word] for word in keep_words}
        self._resetVocabulary()
        for word in keep_words:
            self.addWord(word)
        self.word2count = kept_counts

In [23]:
def normalizeString(s):
    """Lower-case ``s``, strip surrounding whitespace, then delete , . ! ? characters."""
    lowered = s.lower().strip()
    return re.sub(r"([,.!?])", r"", lowered)

In [24]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated corpus ``<lang1>-<lang2>.txt`` into sentence pairs.

    Args:
        lang1: name of the language in the first column (also part of the file name).
        lang2: name of the language in the second column.
        reverse: when True, swap the columns so ``lang2`` becomes the input language.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects are
        still empty and ``pairs`` is a list of lists of normalized strings.

    Raises:
        OSError: if the corpus file cannot be opened.
    """
    print("Чтение строк...")

    filename = '%s-%s.txt' % (lang1, lang2)
    # The context manager closes the handle even if reading fails.
    with open(filename, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each of them.
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Optionally swap the pair order, then create the (still empty) vocabularies.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [25]:
# Sentences with this many space-separated tokens or more are filtered out.
MAX_LENGTH = 10

def filterPair(pairs, prefixes=None):
    """Return the pairs whose two sentences are both shorter than MAX_LENGTH tokens.

    Args:
        pairs: iterable of ``[input_sentence, output_sentence, ...]`` sequences.
        prefixes: optional iterable of strings; when given, a pair is kept only
            if its output sentence starts with one of them. By default no
            prefix filtering is applied.

    Returns:
        A new list with the pairs that pass every check, in their original order.
    """
    allowed_prefixes = tuple(prefixes) if prefixes else None

    def _fits(pair):
        # Both sides must stay below the length limit (tokens split on a single space).
        if len(pair[0].split(' ')) >= MAX_LENGTH or len(pair[1].split(' ')) >= MAX_LENGTH:
            return False
        return allowed_prefixes is None or pair[1].startswith(allowed_prefixes)

    return [pair for pair in pairs if _fits(pair)]

In [26]:
def prepareData(lang1, lang2, reverse = False, filter_pairs_flag = False):
    """Load the corpus, optionally filter it, and build both vocabularies.

    Returns the populated input/output ``Lang`` objects and the list of pairs.
    Progress is reported with ``print``.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print('Read %d sentence pairs' % len(pairs))

    if filter_pairs_flag:
        pairs = filterPair(pairs)
    print('Отфильтровано пар: %d' % len(pairs))

    print('Индексирование слов...')
    vocabularies = (input_lang, output_lang)
    for pair in pairs:
        # Column 0 feeds the input vocabulary, column 1 the output vocabulary;
        # tokens are split on any whitespace.
        for column, vocabulary in enumerate(vocabularies):
            for token in pair[column].split():
                vocabulary.addWord(token)

    summary = (input_lang.n_words, output_lang.n_words)
    print('Индексированных %d слов на входном языке, %d слов на выходном' % summary)
    return input_lang, output_lang, pairs

# Build the vocabularies from rus-eng.txt without swapping the columns,
# then show one randomly chosen pair as a sanity check.
input_lang, output_lang, pairs = prepareData('rus', 'eng', reverse=False)
print(random.choice(pairs))

Чтение строк...
Read 621481 sentence pairs
Отфильтровано пар: 621481
Индексирование слов...
Индексированных 95135 слов на входном языке, 35714 слов на выходном
['постучим по дереву чтобы наша мечта стала явью', "let's knock on wood so that our dream will come true"]


## Фильтрация словарей

In [27]:
''' Minimum number of occurrences a word needs in order to be kept '''
# Lang.addWord gives every word a count of at least 1, so a threshold of 0 keeps all words.
MIN_COUNT = 0

# Apply the frequency threshold to both vocabularies; each call prints a keep_words summary.
input_lang.trim(MIN_COUNT)
output_lang.trim(MIN_COUNT)

keep_words 95132 / 95132 = 1.0000
keep_words 35711 / 35711 = 1.0000


In [28]:
# Keep only the pairs whose every token (split on a single space) is present
# in the corresponding vocabulary.
keep_pairs = []

for pair in pairs:
    input_sentence = pair[0]
    output_sentence = pair[1]

    keep_input = all(token in input_lang.word2index for token in input_sentence.split(' '))
    keep_output = all(token in output_lang.word2index for token in output_sentence.split(' '))

    # A pair survives only when both of its sides are fully covered.
    if keep_input and keep_output:
        keep_pairs.append(pair)

print("Отброшено %d пар до %d, %.4f от общего числа" % (len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
pairs = keep_pairs

Отброшено 621481 пар до 621119, 0.9994 от общего числа


## Преобразование обучающих данных в тензоры

In [29]:
def indexesFromSentence(lang, sentence):
    """Return the vocabulary index of every space-separated word, followed by EOS_token.

    Raises KeyError when a word is missing from ``lang.word2index``.
    """
    indexes = [lang.word2index[token] for token in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes

In [30]:
def padSEQ(seq, max_length):
    """Return a copy of ``seq`` right-padded with PAD_token up to ``max_length`` items.

    The input sequence is left untouched, so the caller's list does not change
    length behind its back. Sequences already at least ``max_length`` long are
    returned (as a new list) without padding or truncation.

    Args:
        seq: sequence of token indices.
        max_length: target length after padding.

    Returns:
        A new list of length ``max(len(seq), max_length)``.
    """
    padded = list(seq)
    padded.extend(PAD_token for _ in range(max_length - len(padded)))
    return padded

In [31]:
def randomBatch(batch_size):
    """Sample ``batch_size`` random pairs and return them as padded index tensors.

    Pairs are drawn with replacement from the global ``pairs`` list, converted
    to index sequences and sorted by input length in descending order.

    Args:
        batch_size: number of pairs to sample; must be at least 1.

    Returns:
        ``(input_var, input_lengths, target_var, target_lengths)`` where the
        tensors are LongTensors on ``device`` with shape (max_len x batch_size)
        and the length lists hold the unpadded length of every sequence.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got %r' % (batch_size,))

    # Draw the random pairs and turn both sides into index sequences.
    seq_pairs = []
    for _ in range(batch_size):
        pair = random.choice(pairs)
        seq_pairs.append((indexesFromSentence(input_lang, pair[0]),
                          indexesFromSentence(output_lang, pair[1])))

    # Longest input first (stable sort), then unzip back into two parallel tuples.
    seq_pairs.sort(key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)

    # Record the true lengths before padding every sequence to the batch maximum.
    input_lengths = [len(s) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    max_input_length = max(input_lengths)
    max_target_length = max(target_lengths)
    input_padded = [padSEQ(s, max_input_length) for s in input_seqs]
    target_padded = [padSEQ(s, max_target_length) for s in target_seqs]

    # (batch_size x max_len) -> (max_len x batch_size). Tensors track gradients
    # themselves, so the deprecated Variable wrapper is not needed.
    input_var = torch.tensor(input_padded, dtype=torch.long).transpose(0, 1).to(device)
    target_var = torch.tensor(target_padded, dtype=torch.long).transpose(0, 1).to(device)

    return input_var, input_lengths, target_var, target_lengths

In [32]:
# Sanity check: display one batch of two pairs (tensors plus their length lists).
randomBatch(2)

(tensor([[    7,   150],
         [ 4121,   550],
         [  915,  1808],
         [    5,   374],
         [ 8652,  9536],
         [   73,  3575],
         [   41,     2],
         [    5,     0],
         [24469,     0],
         [    2,     0]]),
 [10, 7],
 tensor([[   20,    64],
         [  107,   797],
         [  434,   491],
         [ 5079,    22],
         [   92,    23],
         [  150,  1507],
         [  434,   228],
         [11448,    23],
         [    2,     2]]),
 [9, 9])

# Построение моделей

## Encoder

In [33]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder working on padded batches of token indices.

    Args:
        input_size: size of the input vocabulary (number of embedding rows).
        hidden_size: embedding size and GRU hidden size.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability between GRU layers.
    """

    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_size, hidden_size)
        # nn.GRU applies dropout only between layers; with a single layer it has
        # no effect and only triggers a warning, so it is switched off there.
        gru_dropout = self.dropout if n_layers > 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout, bidirectional=True)

    def forward(self, input_seqs, input_lengths, hidden=None):
        """Encode a whole batch at once.

        Args:
            input_seqs: LongTensor of token indices, shape (max_len x batch_size).
            input_lengths: unpadded length of every sequence, sorted in
                descending order as pack_padded_sequence requires by default.
            hidden: optional initial hidden state of shape
                (n_layers * 2 x batch_size x hidden_size); zeros when None.

        Returns:
            ``(outputs, hidden)``: outputs has shape
            (max_len x batch_size x hidden_size) with the two directions summed,
            hidden has shape (n_layers * 2 x batch_size x hidden_size).
        """
        # Keep the (max_len x batch_size x hidden_size) layout: flattening it
        # would break the per-sequence packing below.
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the forward and backward halves so the output width is hidden_size.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

    def initHidden(self, batch_size=1):
        """Return a zero hidden state shaped for this bidirectional GRU.

        The tensor is created on the same device as the module's parameters.
        """
        return torch.zeros(self.n_layers * 2, batch_size, self.hidden_size,
                           device=self.embedding.weight.device)

In [34]:
class Attn(nn.Module):
    """Attention over encoder outputs with 'dot', 'general' or 'concat' scoring.

    Args:
        method: one of 'dot', 'general', 'concat'.
        hidden_size: size of the decoder state and of each encoder output.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """

    METHODS = ('dot', 'general', 'concat')

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()

        if method not in self.METHODS:
            raise ValueError('unknown attention method %r, expected one of %r' % (method, self.METHODS))

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Initialise explicitly: an uninitialised FloatTensor may hold
            # arbitrary values, including NaN or inf.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights for the whole batch in one pass.

        Args:
            hidden: current decoder output, shape (1 x B x hidden_size).
            encoder_outputs: shape (S x B x hidden_size).

        Returns:
            Weights of shape (B x 1 x S); every row sums to 1 over S.
        """
        # Energies are computed as (S x B); hidden broadcasts along S.
        if self.method == 'dot':
            attn_energies = (hidden * encoder_outputs).sum(dim=2)
        elif self.method == 'general':
            attn_energies = (hidden * self.attn(encoder_outputs)).sum(dim=2)
        else:  # 'concat'
            repeated_hidden = hidden.expand(encoder_outputs.size(0), -1, -1)
            projected = self.attn(torch.cat((repeated_hidden, encoder_outputs), 2))
            attn_energies = (self.v * projected).sum(dim=2)

        # Normalise over the source positions, then reshape to B x 1 x S.
        return F.softmax(attn_energies.transpose(0, 1), dim=1).unsqueeze(1)

    def score(self, hidden, encoder_output):
        """Score one decoder state (1 x hidden_size) against one encoder output (1 x hidden_size).

        Kept for single-pair use; ``forward`` evaluates the same formulas for
        all positions and batch entries at once.
        """
        if self.method == 'dot':
            energy = torch.dot(hidden.view(-1), encoder_output.view(-1))
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = torch.dot(hidden.view(-1), energy.view(-1))
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = torch.dot(self.v.view(-1), energy.view(-1))
        return energy

In [41]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits one target token per call, optionally with attention.

    Args:
        attn_model: attention scoring method passed to ``Attn``, or 'none' to
            decode from the GRU output alone.
        hidden_size: embedding size and GRU hidden size.
        output_size: size of the output vocabulary.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability for the embedding and between GRU layers.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()

        # Kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        # nn.GRU applies dropout only between layers; with a single layer it has
        # no effect and only triggers a warning, so it is switched off there.
        gru_dropout = dropout if n_layers > 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        # Attention is optional; always define the attribute so forward() can test it.
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)
        else:
            self.attn = None

    def forward(self, input_seq, last_hidden, encoder_outputs):
        """Run one decoding step for the whole batch.

        Args:
            input_seq: LongTensor with the previous output token of every batch
                entry; its first dimension is the batch size.
            last_hidden: previous GRU state, (n_layers x B x hidden_size).
            encoder_outputs: encoder outputs, (S x B x hidden_size).

        Returns:
            ``(output, hidden, attn_weights)``: unnormalised vocabulary scores
            (B x output_size), the new GRU state, and the attention weights
            (B x 1 x S), or None when the decoder was built with 'none'.
        """
        # Embed the current input word (the last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N

        # New hidden state from the input word and the previous hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        if self.attn is None:
            # No attention model: predict straight from the GRU output.
            return self.out(rnn_output.squeeze(0)), hidden, None

        # Attention from the current RNN state over all encoder outputs,
        # applied to the encoder outputs to get their weighted average.
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector from the RNN state and the context vector
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        # torch.tanh replaces the deprecated F.tanh.
        concat_output = torch.tanh(self.concat(concat_input))

        output = self.out(concat_output)

        # Final scores, hidden state and attention weights (for visualisation)
        return output, hidden, attn_weights

    def initHidden(self, batch_size=1):
        """Return a zero hidden state shaped (n_layers x batch_size x hidden_size).

        The tensor is created on the same device as the module's parameters.
        """
        return torch.zeros(self.n_layers, batch_size, self.hidden_size,
                           device=self.embedding.weight.device)

In [19]:
def tensorFromSentence(lang, sentence):
    """Convert ``sentence`` into a column LongTensor of indices ending with EOS_token.

    indexesFromSentence is intentionally not redefined here: a second
    definition without EOS would shadow the one from the data-preparation
    section and silently strip EOS from every batch built by randomBatch.
    That earlier definition already appends EOS_token, so it is not appended
    again.

    Returns:
        Tensor of shape (len(words) + 1, 1) on ``device``.

    Raises:
        KeyError: if a word of ``sentence`` is not in ``lang.word2index``.
    """
    indexes = indexesFromSentence(lang, sentence)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    """Return ``(input_tensor, target_tensor)`` built from pair[0] and pair[1]."""
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [37]:
# Draw a tiny batch and show the tensor sizes to confirm the time-major layout.
small_batch_size = 2
input_batches, input_lengths, target_batches, target_lengths = randomBatch(small_batch_size)

print('input_batches', input_batches.shape)  # (max_len x batch_size)
print('target_batches', target_batches.shape)  # (max_len x batch_size)

input_batches torch.Size([9, 2])
target_batches torch.Size([10, 2])


In [46]:
# Deliberately tiny dimensions: this cell only checks that the models can be built.
small_hidden_size, small_n_layers = 8, 2

encoder_test = EncoderRNN(
    input_lang.n_words, small_hidden_size, small_n_layers
)
decoder_test = DecoderRNN(
    'general', small_hidden_size, output_lang.n_words, small_n_layers
)

# Printing the moved modules shows their layer summaries.
print(encoder_test.to(device))
print(decoder_test.to(device))

EncoderRNN(
  (embedding): Embedding(95135, 8)
  (gru): GRU(8, 8, num_layers=2, dropout=0.1, bidirectional=True)
)
DecoderRNN(
  (embedding): Embedding(35714, 8)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(8, 8, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=16, out_features=8, bias=True)
  (out): Linear(in_features=8, out_features=35714, bias=True)
  (attn): Attn(
    (attn): Linear(in_features=8, out_features=8, bias=True)
  )
)


In [None]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a task started at ``since``.

    ``percent`` is the completed fraction; the total time is extrapolated as
    elapsed / percent, so a value of 0 raises ZeroDivisionError.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn pairs, reporting and plotting the loss.

    One SGD optimizer is created per model. The average loss is printed every
    ``print_every`` iterations and recorded every ``plot_every`` iterations;
    the recorded averages are passed to ``showPlot`` at the end.

    NOTE(review): ``train`` is called for every iteration but is not defined
    in the visible part of this notebook; confirm where it comes from.
    """
    started_at = time.time()
    loss_history = []
    running_print_loss = 0  # cleared after every printed report
    running_plot_loss = 0   # cleared after every recorded plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training examples are sampled (with replacement) up front.
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor, target_tensor = training_pair[0], training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += loss
        running_plot_loss += loss

        if step % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, mean_print_loss))

        if step % plot_every == 0:
            loss_history.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    showPlot(loss_history)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line with y-axis ticks every 0.2.

    A single figure is created; an extra ``plt.figure()`` call before
    ``plt.subplots()`` would leave a second, empty figure open.
    """
    fig, ax = plt.subplots()
    # This locator puts ticks at regular intervals.
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` with the batched encoder/decoder of this notebook.

    The sentence is encoded as a batch of one through
    ``encoder(input_seqs, input_lengths)``. The decoder then starts from
    SOS_token and the first ``decoder.n_layers`` slices of the encoder hidden
    state, and emits the highest-scoring token at every step.

    Args:
        encoder: module called as ``encoder(input_seqs, input_lengths)``.
        decoder: module called as ``decoder(input_seq, hidden, encoder_outputs)``
            that exposes ``n_layers``.
        sentence: normalized input sentence whose words are all in ``input_lang``.
        max_length: maximum number of decoding steps.

    Returns:
        ``(decoded_words, attentions)``: the produced words ('<EOS>' is appended
        when the decoder emits it) and a CPU tensor with one attention row per
        decoding step. Rows are max(max_length, input_length) wide and
        zero-padded beyond the input length.

    Raises:
        KeyError: if ``sentence`` contains a word missing from the input vocabulary.
    """
    # Dropout must be off while decoding; restore the previous modes afterwards.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)  # (S x 1)
            input_length = input_tensor.size(0)

            # The encoder takes the whole sequence plus its length, not one token at a time.
            encoder_outputs, encoder_hidden = encoder(input_tensor, [input_length])

            decoder_input = torch.tensor([SOS_token], device=device)  # batch of one SOS
            # The bidirectional encoder returns n_layers * 2 states; the decoder GRU
            # accepts only n_layers of them.
            decoder_hidden = encoder_hidden[:decoder.n_layers]

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max(max_length, input_length))

            steps = 0
            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                steps = di + 1
                if decoder_attention is not None:
                    decoder_attentions[di, :input_length] = decoder_attention.view(-1).cpu()

                topi = decoder_output.argmax(dim=1)
                token = topi.item()
                if token == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                decoded_words.append(output_lang.index2word[token])

                # Feed the prediction back in as the next input.
                decoder_input = topi

            return decoded_words, decoder_attentions[:steps]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Translate ``n`` random pairs and print source (>), reference (=) and output (<)."""
    for _ in range(n):
        source, reference = (lambda chosen: (chosen[0], chosen[1]))(random.choice(pairs))
        print('>', source)
        print('=', reference)
        output_words, _attentions = evaluate(encoder, decoder, source)
        print('<', ' '.join(output_words))
        print('')

In [None]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# AttnDecoderRNN is not defined in this notebook; the attention decoder here is
# DecoderRNN, which takes the attention method first and names its dropout
# argument `dropout`. 'general' matches the method used for decoder_test.
attn_decoder1 = DecoderRNN('general', hidden_size, output_lang.n_words, dropout=0.1).to(device)

trainIters(encoder1, attn_decoder1, 15000, print_every=500)

In [None]:
# Print translations of randomly chosen pairs (evaluateRandomly's default n is 10).
evaluateRandomly(encoder1, attn_decoder1)

In [None]:
def evaluateA(input_sentence):
    """Translate ``input_sentence`` with encoder1/attn_decoder1 and print input and output."""
    translation = evaluate(encoder1, attn_decoder1, input_sentence)[0]
    print('input =', input_sentence)
    print('output =', ' '.join(translation))

In [None]:
# NOTE(review): input_lang is built from the 'rus' column (see the prepareData call),
# while 'mather' looks like a misspelled English word; the word lookup raises KeyError
# unless this exact token is in the input vocabulary. Confirm the intended input.
evaluateA('mather')

In [None]:
# Save the weights of the trained instances. state_dict() must be called on the
# model objects, not on the classes, and each model needs its own file.
# A relative directory keeps the notebook runnable on other machines.
MODEL_DIR = 'checkpoints'
os.makedirs(MODEL_DIR, exist_ok=True)

torch.save(encoder1.state_dict(), os.path.join(MODEL_DIR, 'encoder.pt'))
torch.save(attn_decoder1.state_dict(), os.path.join(MODEL_DIR, 'attn_decoder.pt'))