In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [3]:
skip_training = False

In [4]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 10

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
#     s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    
    # Split every line into pairs and normalize
    pairs = [[s for s in l.split('\t')] for l in lines]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [9]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('fin_train', 'mk_train', False)

Reading lines...
Read 1914246 sentence pairs
Counting words...
Trimmed to 1565490 sentence pairs
Counted words:
fin_train 515903
mk_train 363715


In [10]:
from torch.nn.utils.rnn import pad_sequence

def collate(list_of_samples):
    # sort a list by sequence length
    list_of_samples.sort(key=lambda x: len(x[0]), reverse=True)

    input_seqs, output_seqs = zip(*list_of_samples)
    input_seq_lengths = [len(seq) for seq in input_seqs]
    output_seq_lengths = [len(seq) for seq in output_seqs]

    padding_value = 0
    
    pad_input_seqs = pad_sequence(input_seqs, batch_first=False, padding_value=padding_value)
    pad_output_seqs = []
    for i in output_seqs:
        padded = i.new_zeros(max(output_seq_lengths) - i.size(0))
        pad_output_seqs.append(torch.cat((i, padded.view(-1, 1)), dim=0))
    
    pad_output_seqs = torch.stack(pad_output_seqs)
    pad_output_seqs = pad_output_seqs.permute(1, 0, 2)

    return pad_input_seqs, input_seq_lengths, pad_output_seqs, output_seq_lengths

In [11]:
# training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(10000)]
# training_pairs = [tensorsFromPair(pairs[i]) for i in range(0, len(pairs))]

In [12]:
# pairs_batch = DataLoader(dataset=training_pairs,
#                          batch_size=128,
#                          shuffle=True,
#                          collate_fn=collate,
#                          pin_memory=True)

In [13]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, dictionary_size, hidden_size, dropout_p=0.2, num_layers=2):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(dictionary_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, 
                          hidden_size, 
                          num_layers=self.num_layers,
                          bidirectional=True
                         )

    def forward(self, pad_seqs, seq_lengths, hidden):
        embedded = self.embedding(pad_seqs).squeeze(dim=2)
        packed = pack_padded_sequence(embedded, seq_lengths)
        outputs, hidden = self.lstm(packed)
        outputs = pad_packed_sequence(outputs)[0]
        return outputs, hidden[0]

    def init_hidden(self, batch_size=1, device=device):
        return torch.zeros(2*self.num_layers, batch_size, self.hidden_size, device=device)


In [14]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_dictionary_size, dropout_p=0.2, num_layers=2):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_dictionary_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, 
                          hidden_size, 
                          num_layers=self.num_layers,
                          bidirectional=False
                         )
        self.out = nn.Linear(hidden_size, output_dictionary_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, hidden, pad_target_seqs=None, teacher_forcing=False):
        if pad_target_seqs is None:
            assert not teacher_forcing, 'Cannot use teacher forcing without a target sequence.'

        batch_size = hidden.size(1)
        prev_word = torch.tensor(SOS_token * np.ones((1, batch_size)), device=device, dtype=torch.int64)
        max_length = pad_target_seqs.size(0) if pad_target_seqs is not None else MAX_LENGTH
        outputs = []  # Collect outputs of the decoder at different steps in this list
        for t in range(max_length):
            # YOUR CODE HERE
            prev_word = prev_word.view(1, -1)
            output = self.embedding(prev_word)
            output = F.relu(output)
            output, hidden = self.lstm(output)
            output = self.softmax(self.out(output))
    
            outputs.append(output)

            if teacher_forcing:
                # Feed the target as the next input
                prev_word = pad_target_seqs[t]
            else:
                # Use its own predictions as the next input
                topv, topi = output[0, :].topk(1)
                prev_word = topi.squeeze().detach()  # detach from history as input

        outputs = torch.cat(outputs, dim=0)  # [max_length, batch_size, output_dictionary_size]
        return outputs, hidden[0]

    def init_hidden(self, batch_size, device=device):
        return torch.zeros(1*self.num_layers, batch_size, self.hidden_size, device=device)

In [15]:
hidden_size = 256
encoder = Encoder(input_lang.n_words, hidden_size, dropout_p=0.5).to(device)
decoder = Decoder(hidden_size, output_lang.n_words, dropout_p=0.5).to(device)

In [16]:
criterion = nn.NLLLoss(reduction='none')  # Use this criterion in the loss calculations

def compute_loss(decoder_outputs, pad_target_seqs, padding_value=0):  
    n = 0
    loss = 0
    for i in range(decoder_outputs.shape[1]):
        p_arr = []
        t_arr = []
        p = decoder_outputs[:, i, :]
        t = pad_target_seqs[:, i].squeeze()
        for word in range(len(t)):
            if t[word] != padding_value:
                p_arr.append(p[word])
                t_arr.append(t[word])
                n += 1
            
        p_arr = torch.stack(p_arr)
        t_arr = torch.stack(t_arr)
        loss += criterion(p_arr, t_arr).sum()  
    loss = loss / n

    return loss

In [17]:
n_epochs = 10
teacher_forcing_ratio = 0.5

# loss_total = 0  # Reset every print_every

encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.1)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.1)

In [29]:
if skip_training == False:
    for epoch in range(n_epochs):
        training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(100000)]
        pairs_batch = DataLoader(dataset=training_pairs,
                         batch_size=128,
                         shuffle=True,
                         collate_fn=collate,
                         pin_memory=True)        
#         running_loss = 0.0
#         print_every = 10  # iterations
        
        for i, batch in enumerate(pairs_batch):
            pad_input_seqs, input_seq_lengths, pad_target_seqs, target_seq_lengths = batch
            batch_size = pad_input_seqs.size(1)
            pad_input_seqs, pad_target_seqs = pad_input_seqs.to(device), pad_target_seqs.to(device)

            encoder_hidden = encoder.init_hidden(batch_size, device)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Encode input sequence
            _, encoder_hidden = encoder(pad_input_seqs, input_seq_lengths, encoder_hidden)

            # Decode using target sequence for teacher forcing
            decoder_hidden = encoder_hidden
            teacher_forcing = True if random.random() < teacher_forcing_ratio else False
            decoder_outputs, decoder_hidden = decoder(decoder_hidden, pad_target_seqs, teacher_forcing=teacher_forcing)

            # decoder_outputs is [max_seq_length, batch_size, output_dictionary_size]
            # pad_target_seqs in [max_seq_length, batch_size, 1]
            loss = compute_loss(decoder_outputs, pad_target_seqs, padding_value=0)
            loss.backward()

            encoder_optimizer.step()
            decoder_optimizer.step()

            # print statistics
#             running_loss += loss.item()
#             if (i % print_every) == (print_every-1) or i == (len(training_pairs) // pairs_batch.batch_size):
#                 print('[%d, %5d] loss: %.4f' % (epoch+1, i+1, running_loss/print_every))
#                 running_loss = 0.0

        print('[Epoch: %d] loss: %.4f' % (epoch+1, loss.item()))
    
    print('Final loss is: %f' % (loss.item()))
    print('Finished Training')

[Epoch: 1] loss: 12.6840
[Epoch: 2] loss: 12.6520
Final loss is: 12.651977
Finished Training


In [None]:
if skip_training == False:
    torch.save(encoder, 'model/encoder')
    torch.save(decoder, 'model/decoder')
else:
    encoder = torch.load('model/encoder')
    encoder.eval()
    
    decoder = torch.load('model/decoder')
    decoder.eval()

In [20]:
def evaluate(input_seq):
    with torch.no_grad():
        input_length = input_seq.size(0)
        batch_size = 1

        encoder_hidden = encoder.init_hidden(batch_size, device)
        input_seq = input_seq.view(-1, 1, 1).to(device)
        encoder_output, encoder_hidden = encoder(input_seq, [input_length], encoder_hidden)

        decoder_hidden = encoder_hidden
        decoder_outputs, decoder_hidden = decoder(decoder_hidden, pad_target_seqs=None, teacher_forcing=False)

        output_seq = []
        for t in range(decoder_outputs.size(0)):
            topv, topi = decoder_outputs[t].data.topk(1)
            output_seq.append(topi.item())
            if topi.item() == EOS_token:
                break

    return output_seq

In [21]:
# Evaluate random sentences from the training set
print('\nEvaluate on training data:')
print('-----------------------------')
for i in range(5):
    input_sentence, target_sentence = training_pairs[np.random.choice(len(training_pairs))]
    print('>', ' '.join(input_lang.index2word[i.item()] for i in input_sentence))
    print('=', ' '.join(output_lang.index2word[i.item()] for i in target_sentence))
    output_sentence = evaluate(input_sentence)
    print('<', ' '.join(output_lang.index2word[i] for i in output_sentence))
    print('')


Evaluate on training data:
-----------------------------
> - hän on ostoksilla. EOS
= отиде во продавница. EOS
< не,не,нема! tire откочи лажеше? лисгар. резервати. лисгар. резервати. лисгар. резервати.

> - lähde joukkueen kanssa pelimatkalle. EOS
= оди на турнејата со тимот. EOS
< не,не,нема! tire откочи лажеше? лисгар. резервати. лисгар. резервати. лисгар. резервати.

> heinoa. 500 000 japanilaiselta herrasmieheltä. EOS
= 500.000 долари за господин од јапонија. EOS
< не,не,нема! tire откочи лажеше? лисгар. резервати. лисгар. резервати. лисгар. резервати.

> - ei hän ole kuviteltu. EOS
= не е? EOS
< не,не,нема! tire откочи лажеше? лисгар. резервати. лисгар. резервати. лисгар. резервати.

> - minun täytyy käydä vessassa. EOS
= морам да мочам. EOS
< не,не,нема! tire откочи лажеше? лисгар. резервати. лисгар. резервати. лисгар. резервати.

