Requirements

In [1]:
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

--------------------------------------------------------------

Data Preparation

In [101]:
# Reserved vocabulary indices. Lang.idx2word maps 0 -> "SOS" and 1 -> "EOS";
# the decoder is seeded with SOS and every encoded sentence is terminated with EOS.
SOS = 0
EOS = 1

class Lang:
    """Vocabulary for one language.

    Attributes:
        name: label of the language.
        word2idx: maps each seen word to its integer index.
        word2count: maps each seen word to how many times it was added.
        idx2word: inverse of ``word2idx``; pre-populated with 0 -> "SOS", 1 -> "EOS".
        n_words: number of indices handed out so far (the two special tokens included).
    """

    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.word2count = {}
        # Indices 0 and 1 are reserved for the start/end markers.
        self.idx2word = dict(enumerate(("SOS", "EOS")))
        self.n_words = len(self.idx2word)

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(" "):
            self.addWord(token)

    def addWord(self, word):
        """Give ``word`` the next free index on first sight, otherwise bump its count."""
        if word in self.word2idx:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2idx[word] = new_index
        self.word2count[word] = 1
        self.idx2word[new_index] = word
        self.n_words = new_index + 1

Convert Unicode text to plain ASCII by stripping diacritics (e.g. "é" -> "e"), then normalize the sentence

In [102]:
def UnicodeToAscii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is decomposed with NFD so that every accented character becomes
    a base character followed by combining marks (Unicode category ``Mn``);
    those marks are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lower-case, strip accents and reduce ``s`` to letters plus ``!`` and ``?``.

    Steps: trim and lower-case, remove diacritics via ``UnicodeToAscii``, put a
    space in front of ``.``, ``!`` and ``?``, then collapse every run of
    characters other than ASCII letters, ``!`` and ``?`` into a single space.
    The result is trimmed again before being returned.
    """
    cleaned = s.strip().lower()
    cleaned = UnicodeToAscii(cleaned)
    # Detach sentence punctuation from the preceding word.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Everything that is not a letter, "!" or "?" becomes one space.
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", cleaned)
    return cleaned.strip()

In [103]:
def readLang(lang1, lang2, reverse=False):
    """Load the tab-separated sentence pairs from ``data/{lang1}-{lang2}.txt``.

    Every line is split on tabs and each field is passed through
    ``normalizeString``.

    Args:
        lang1: name of the language in the first column (also part of the file name).
        lang2: name of the language in the second column (also part of the file name).
        reverse: when True, each pair is reversed and the roles of the two
            languages are swapped, so ``lang2`` becomes the input language.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects are
        still empty and ``pairs`` is a list of lists of normalized sentences.

    Raises:
        OSError: if the data file cannot be opened.
    """
    print("Reading lines....")
    # Read as UTF-8 explicitly (the platform default encoding may differ) and
    # close the handle deterministically instead of leaking it.
    with open(f"data/{lang1}-{lang2}.txt", encoding="utf-8") as data_file:
        lines = data_file.read().strip().split("\n")

    pairs = [[normalizeString(s) for s in l.split("\t")] for l in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [104]:
# Upper bound on sentence length: filterPair keeps only sentences with fewer
# than MAX_LENGTH space-separated words. It is also the number of decoding
# steps in DecoderRNN.forward and the padded row width in get_dataloader.
MAX_LENGTH = 10

# Sentence openings accepted by filterPair for the second element of a pair.
# The contracted forms are spelled with a space ("i m") because
# normalizeString replaces the apostrophe with a space.
eng_prefix = (
    "i am", "i m",
    "he is", "he s",
    "she is", "she s",
    "you are", "you re",
    "we are", "we re",
    "they are", "they re",
)

def filterPair(p):
    """Tell whether the pair ``p`` should be kept.

    A pair is kept when both sentences have fewer than ``MAX_LENGTH``
    space-separated words and the second sentence begins with one of the
    prefixes in ``eng_prefix``.
    """
    if len(p[0].split(" ")) >= MAX_LENGTH:
        return False

    second = p[1]
    if len(second.split(" ")) >= MAX_LENGTH:
        return False

    return second.startswith(eng_prefix)

def filterPairs(pairs):
    """Return a new list with only the pairs accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))

In [105]:
def prepareData(lang1, lang2, reverse=False):
    """Read, filter and index the sentence pairs for a language pair.

    Loads the pairs with ``readLang``, keeps those accepted by ``filterPairs``
    and feeds the surviving sentences into the two vocabularies. Progress is
    reported on stdout.

    Returns:
        ``(input_lang, output_lang, pairs)`` with populated vocabularies and
        the filtered list of pairs.
    """
    source_vocab, target_vocab, sentence_pairs = readLang(lang1, lang2, reverse)
    print(f"Read {len(sentence_pairs)} sentence pairs")

    sentence_pairs = filterPairs(sentence_pairs)
    print(f"Trimmed to {len(sentence_pairs)} sentence pairs")

    print("Counting Words...")
    for entry in sentence_pairs:
        source_vocab.addSentence(entry[0])
        target_vocab.addSentence(entry[1])

    print("Counted Words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)

    return source_vocab, target_vocab, sentence_pairs

# Build the module-level vocabularies and pair list. reverse=True swaps each
# pair, so the input language is "fra" and the output language is "eng".
input_lang, output_lang, pairs = prepareData("eng", "fra", True)
# Sanity check: show one randomly chosen [input, target] pair.
print(random.choice(pairs))

Reading lines....
Read 135842 sentence pairs
Trimmed to 12892 sentence pairs
Counting Words...
Counted Words:
fra 5228
eng 3434
['ils sont avec moi', 'they re with me']


--------------------------------------------------------

Seq2Seq Model

In [106]:
class EncoderRNN(nn.Module):
    """Embedding -> dropout -> single-layer LSTM encoder.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: size of both the embeddings and the LSTM state.
        dropout: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input):
        """Encode a batch of token indices.

        Returns:
            ``(output, hidden)`` exactly as produced by ``nn.LSTM``: the
            per-step outputs and the final ``(h_n, c_n)`` state tuple.
        """
        return self.lstm(self.dropout(self.embedding(input)))
        


class DecoderRNN(nn.Module):
    """Embedding -> ReLU -> single-layer LSTM -> linear decoder.

    Decoding always runs for ``MAX_LENGTH`` steps, starting from the ``SOS``
    token and the encoder's final state.

    Args:
        hidden_size: size of the embeddings and of the LSTM state.
        output_size: size of the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps for every sequence in the batch.

        Args:
            encoder_outputs: encoder outputs; only the batch size (dim 0) and
                the device are read from it.
            encoder_hidden: initial LSTM state for the decoder.
            target_tensor: optional ``(batch, seq_len)`` tensor of target
                indices. When given, teacher forcing is used: the ground-truth
                token of step ``i`` is fed as the input of step ``i + 1``.
                Once the target columns are exhausted (``seq_len`` shorter
                than ``MAX_LENGTH``) decoding continues greedily. When omitted
                the decoder always feeds back its own top-1 prediction.

        Returns:
            ``(log_probs, hidden, None)`` where ``log_probs`` has shape
            ``(batch, MAX_LENGTH, output_size)`` and ``hidden`` is the final
            LSTM state. The third element is always ``None``.
        """
        batch_size = encoder_outputs.size(0)
        # Create the start tokens on the same device as the incoming tensors
        # so the module does not depend on a module-level device variable.
        decoder_input = torch.full(
            (batch_size, 1), SOS, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None and i < target_tensor.size(1):
                # Teacher forcing: the next input is the ground-truth token.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Greedy decoding: feed back the most likely token, detached
                # so no gradient flows through the discrete choice.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step and return ``(logits, new_hidden)``."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        return output, hidden

---
Train

In [107]:
def indexesFromSentence(lang, sentence):
    """Translate the single-space-separated words of ``sentence`` into indices.

    Raises:
        KeyError: if a word is missing from ``lang.word2idx``.
    """
    lookup = lang.word2idx
    indexes = []
    for token in sentence.split(" "):
        indexes.append(lookup[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a ``(1, n_words + 1)`` long tensor ending in ``EOS``.

    The tensor is created on the module-level ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS]
    row = torch.tensor(token_ids, dtype=torch.long, device=device)
    return row.view(1, -1)

def tensorFromPair(pair):
    """Encode a sentence pair with the module-level vocabularies.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with
    ``output_lang``; the two tensors are returned as a tuple.
    """
    encoded_source = tensorFromSentence(input_lang, pair[0])
    encoded_target = tensorFromSentence(output_lang, pair[1])
    return encoded_source, encoded_target

def get_dataloader(batch_size):
    """Prepare the eng/fra data (reversed) and wrap it in a shuffling DataLoader.

    Every sentence is converted to indices, terminated with ``EOS`` and
    written into a row of width ``MAX_LENGTH``. Unused positions keep the
    value 0, which is also the ``SOS`` index.

    Args:
        batch_size: number of pairs per batch.

    Returns:
        ``(input_lang, output_lang, train_dataloader)``; each batch yields an
        ``(input_ids, target_ids)`` pair of long tensors on ``device``.
    """
    # Use the pairs returned by this call. The previous spelling ("paris")
    # discarded them and silently fell back to the notebook-global ``pairs``.
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS)
        tgt_ids.append(EOS)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )

    train_sample = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sample, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

In [108]:
# Scratch demo of the reshape used in train_epoch: merging the first two
# dimensions turns a (32, 10, 5000) tensor into (320, 5000).
# NOTE(review): this rebinds the name `decoder`, which a later cell assigns
# to the DecoderRNN model; re-running this cell afterwards would clobber it.
decoder = torch.rand(32, 10, 5000)
print(decoder.shape)
new_decoder = decoder.view(-1, decoder.size(-1))
print(new_decoder.shape)

torch.Size([32, 10, 5000])
torch.Size([320, 5000])


In [109]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    For every ``(input_tensor, target_tensor)`` batch the gradients are
    cleared, the batch is pushed through encoder and decoder, the loss is
    back-propagated and both optimizers take a step.

    Returns:
        The sum of the batch losses divided by ``len(dataloader)``.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    batch_losses = []

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

        # The criterion is given one row per token: predictions are flattened
        # to (batch * seq_len, vocab) and targets to (batch * seq_len,).
        vocab_size = decoder_outputs.size(-1)
        flat_predictions = decoder_outputs.view(-1, vocab_size)
        flat_targets = target_tensor.view(-1)

        loss = criterion(flat_predictions, flat_targets)
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)


In [110]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2.

    NOTE(review): the notebook switches matplotlib to the 'agg' backend
    before this definition; with that backend the figure may not be rendered
    inline - confirm this is intended.
    """
    # plt.subplots() already creates the figure. The extra plt.figure() call
    # that used to precede it only left an empty figure behind on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [111]:
def train(train_dataloader, encoder, decoder, epochs, lr=0.001, print_every=100, plot_every=100):
    """Train encoder and decoder for ``epochs`` epochs with Adam and NLL loss.

    Args:
        train_dataloader: batches handed to ``train_epoch``.
        encoder, decoder: the two modules to optimise (one Adam optimizer each).
        epochs: number of passes over the data.
        lr: learning rate of both optimizers.
        print_every: print the mean epoch loss every this many epochs.
        plot_every: record the mean epoch loss for plotting every this many epochs.

    The recorded averages are passed to ``showPlot`` once training is done.
    """
    criterion = nn.NLLLoss()
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr)

    plot_losses = []
    # Running sums; each is reset whenever its reporting interval elapses.
    running_print = 0
    running_plot = 0

    for epoch in range(1, epochs + 1):
        epoch_loss = train_epoch(
            train_dataloader, encoder, decoder,
            encoder_optimizer, decoder_optimizer, criterion,
        )
        running_print += epoch_loss
        running_plot += epoch_loss

        if epoch % print_every == 0:
            print_loss_avg = running_print / print_every
            running_print = 0
            print(f"Epoch {epoch:3d}/{epochs} ({epoch/epochs*100:5.1f}%) | Avg Loss: {print_loss_avg:.4f}")

        if epoch % plot_every == 0:
            plot_losses.append(running_plot / plot_every)
            running_plot = 0

    showPlot(plot_losses)

In [112]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Translate one sentence and return the predicted words.

    The sentence is encoded with ``input_lang``, run through encoder and
    decoder without gradient tracking, and the top-1 index of every step is
    mapped back to a word via ``output_lang.idx2word``. The list stops at, and
    includes, ``"<EOS>"`` when the end index is predicted.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, _ = decoder(encoder_outputs, encoder_hidden)

        # Most likely vocabulary index at every decoding step.
        best_ids = decoder_outputs.topk(1)[1].squeeze()

        decoded_words = []
        for token in best_ids:
            token_id = token.item()
            if token_id == EOS:
                decoded_words.append("<EOS>")
                break
            decoded_words.append(output_lang.idx2word[token_id])

    return decoded_words

In [113]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly drawn pairs together with the model's translation.

    Uses the module-level ``pairs``, ``input_lang`` and ``output_lang``.
    Output per sample: ``>`` source, ``=`` reference, ``<`` prediction.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print(">", sample[0])
        print("=", sample[1])
        predicted = evaluate(encoder, decoder, sample[0], input_lang, output_lang)
        print("<", " ".join(predicted))
        print("")

In [None]:
# Hyper-parameters: embedding/LSTM width and mini-batch size.
hidden_size = 128
batch_size = 32

# Rebuilds the vocabularies and returns a shuffling DataLoader of padded index tensors.
input_lang, output_lang, train_loader = get_dataloader(batch_size)

# Vocabulary sizes determine the embedding table and output layer dimensions.
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

# 80 epochs; the averaged loss is printed and recorded for plotting every 5 epochs.
train(train_loader, encoder, decoder, epochs=80, print_every=5, plot_every=5)

Reading lines....
Read 135842 sentence pairs
Trimmed to 12892 sentence pairs
Counting Words...
Counted Words:
fra 5228
eng 3434
Epoch   5/80 (  6.2%) | Avg Loss: 2.2408


In [None]:
# Switch both modules to evaluation mode (this turns off the encoder's
# dropout) before printing sample translations.
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> je suis en vacances
= i m on holiday
< i m the one <EOS>

> il est cense etre chez lui aujourd hui
= he is supposed to be at home today
< he is used to be in to <EOS>

> il me faut trouver de nouvelles amies
= i must find some new friends
< i must find new new new <EOS>

> vous avez tort
= you are wrong
< you re wrong <EOS>

> elle est une vrai beaute
= she is a real beauty
< she is a doctor beautiful <EOS>

> nous sommes pareils
= we re the same
< we re the here <EOS>

> j ai emis des reserves
= i made reservations
< i made you <EOS>

> je ne suis toujours pas impressionne
= i m still not impressed
< i m not not either <EOS>

> nous sommes tres reconnaissants pour votre hospitalite
= we re very grateful for your hospitality
< we re very grateful for your hospitality <EOS>

> il tira sur l oiseau mais le manqua
= he shot at the bird but missed it
< he saw the the bird the the <EOS>



----
Benchmark

In [1]:
import nltk
nltk.download('punkt')

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

[nltk_data] Downloading package punkt to /home/saif/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def calculate_blue(encoder, decoder, pairs, input_lang, output_lang, n_samples=100):
    """Compute a smoothed corpus-level BLEU score on a random sample of pairs.

    Args:
        encoder, decoder: trained modules passed to ``evaluate``.
        pairs: sequence of ``[source, target]`` sentence pairs.
        input_lang, output_lang: vocabularies passed to ``evaluate``.
        n_samples: maximum number of pairs to sample (without replacement).

    Returns:
        The BLEU score scaled to 0-100, or ``0.0`` when no pair was sampled.
    """
    # corpus_bleu expects, per hypothesis, a LIST of tokenized references.
    references = []
    hypotheses = []

    test_pairs = random.sample(pairs, min(n_samples, len(pairs)))

    for pair in test_pairs:
        # Exactly one reference translation per source sentence. Previously
        # the accumulator itself was overwritten with the tokens of the
        # current target and then appended to itself.
        references.append([pair[1].split()])

        predicted_words = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        if "<EOS>" in predicted_words:
            # Drop the end marker and anything after it before scoring.
            predicted_words = predicted_words[:predicted_words.index("<EOS>")]
        hypotheses.append(predicted_words)

    if not hypotheses:
        return 0.0

    smoothie = SmoothingFunction().method4
    blue_score = corpus_bleu(references, hypotheses, smoothing_function=smoothie)

    return blue_score * 100


In [7]:
# Scratch demo: three (10, 20) tensors stacked with torch.hstack.
A = torch.randn((10,20))
B = torch.randn((10,20))
C = torch.randn((10,20))


# The result is (10, 60): the tensors are joined along the second dimension.
W = torch.hstack([A, B, C])
W.shape

torch.Size([10, 60])