## Importing nessesary libraries

In [96]:
import unicodedata
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torch

## Importing data

In [2]:
import tensorflow as tf

In [3]:
# Download the file
import pathlib

path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  context = np.array([context for target, context in pairs])
  target = np.array([target for target, context in pairs])

  return target, context

In [5]:
target_raw, context_raw = load_data(path_to_file)

In [6]:
idx = -1
print(target_raw[idx])
print(context_raw[idx])

If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.
Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.


## Preprocessing: Text normalization

In [7]:
def normalize_unicode(text, normalization_form='NFC'):
    normalized_text = unicodedata.normalize(normalization_form, text)
    return normalized_text

In [8]:
def normalize(text_array):
    return_array = []
    for sentence in text_array:
        sentence = sentence.lower()
        sentence = normalize_unicode(sentence)
        sentence = sentence.replace('.', '')
        return_array.append(sentence)
    return np.array(return_array)

In [9]:
print(f"Length of context raw: {len(context_raw)}")
print(f"Length of target raw: {len(target_raw)}")

Length of context raw: 118964
Length of target raw: 118964


In [10]:
context = normalize(context_raw)
target = normalize(target_raw)

In [11]:
context

array(['ve', 'vete', 'vaya', ...,
       'una huella de carbono es la cantidad de contaminación de dióxido de carbono que producimos como producto de nuestras actividades algunas personas intentan reducir su huella de carbono porque están preocupados acerca del cambio climático',
       'como suele haber varias páginas web sobre cualquier tema, normalmente sólo le doy al botón de retroceso cuando entro en una página web que tiene anuncios en ventanas emergentes simplemente voy a la siguiente página encontrada por google y espero encontrar algo menos irritante',
       'si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado'],
      dtype='<U276')

In [12]:
context_raw

array(['Ve.', 'Vete.', 'Vaya.', ...,
       'Una huella de carbono es la cantidad de contaminación de dióxido de carbono que producimos como producto de nuestras actividades. Algunas personas intentan reducir su huella de carbono porque están preocupados acerca del cambio climático.',
       'Como suele haber varias páginas web sobre cualquier tema, normalmente sólo le doy al botón de retroceso cuando entro en una página web que tiene anuncios en ventanas emergentes. Simplemente voy a la siguiente página encontrada por Google y espero encontrar algo menos irritante.',
       'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'],
      dtype='<U278')

In [13]:
print(f"Length of context preprocessed: {len(context)}")
print(f"Length of target preprocessed: {len(target)}")

Length of context preprocessed: 118964
Length of target preprocessed: 118964


## Preprocessing: Vectorization

In [14]:
START_TOKEN = 0
PAD_TOKEN = 1
END_TOKEN = 2
UNK_TOKEN = 3

In [15]:
class Vocab:
    def __init__(self, text_array, freq=2):
        self.text_array = text_array
        self.ttov = {"<sos>": START_TOKEN, "<pad>": PAD_TOKEN,
                     "<eos>": END_TOKEN, "<unk>": UNK_TOKEN}
        self.vtot = dict((idx, token) for token, idx in self.ttov.items())
        self.length = 4
        self.counter = Counter()
        self.freq = freq
        self.build_vocab()

    def add(self, token):
        self.ttov[token] = self.length
        self.vtot[self.length] = token
        self.length += 1

    def build_vocab(self):
        for sentence in self.text_array:
            for token in sentence.split():
                self.counter.update([token])

        for token, counter in self.counter.items():
            if counter >= self.freq:
                self.add(token)

    def vecToText(self, vec):
        text = []
        for v in vec:
            if v in self.vtot:
                text.append(self.vtot[v])
            else:
                text.append("<unk>")
        return torch.tensor(text)

    def textToVec(self, text):
        vec = []
        for t in text:
            if t in self.ttov:
                vec.append(self.ttov[t])
            else:
                vec.append(UNK_TOKEN)
        return torch.tensor(vec)

In [16]:
context_vocab = Vocab(context)
target_vocab = Vocab(target)

In [17]:
print(f"Length of context dictionary: {context_vocab.length}")
print(f"Length of target dictionary: {target_vocab.length}")

Length of context dictionary: 18169
Length of target dictionary: 11401


## Dataset

In [18]:
class TextDataset(Dataset):
    def __init__(self, context_array, target_array, context_vocab, target_vocab):
        if len(context_array) != len(target_array):
            raise ValueError("Lengths of context and target must be equal.")
        self.context = context_array
        self.target = target_array
        self.context_vocab = context_vocab
        self.target_vocab = target_vocab
        self.length = len(context_array)
        self.context_max_length = self.getMaxLen(self.context)
        self.target_max_length = self.getMaxLen(self.target)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        context = self.context_vocab.textToVec(self.context[idx].split())
        target = self.target_vocab.textToVec(self.target[idx].split())
        return context, target

    def getMaxLen(self, text_array):
        return len(max(self.context, key=lambda x: len(x.split())).split())

In [19]:
data = TextDataset(context, target, context_vocab, target_vocab)
data.__getitem__(333)

(tensor([142, 311]), tensor([ 40, 192]))

In [20]:
CONTEXT_MAX_LENGTH = data.context_max_length
TARGET_MAX_LENGTH = data.target_max_length
print(f"Maximum length of context: {CONTEXT_MAX_LENGTH}")
print(f"Maximum length of target: {TARGET_MAX_LENGTH}")

Maximum length of context: 49
Maximum length of target: 49


## Dataloader

In [21]:
def collate_fn(batch):
    context, target = [data[0] for data in batch], [data[1] for data in batch]
    target_in = [torch.cat([torch.tensor([START_TOKEN]), text]) for text in target]
    target_out = [torch.cat([text, torch.tensor([END_TOKEN])]) for text in target]
    return pad(context, CONTEXT_MAX_LENGTH), pad(target_in, TARGET_MAX_LENGTH), pad(target_out, TARGET_MAX_LENGTH)

def pad(texts, max_len):
    padded = []
    for text in texts:
        while len(text) < max_len:
            text = torch.cat([text, torch.tensor([PAD_TOKEN])])
        padded.append(text)
    return torch.stack(padded)

In [22]:
d = DataLoader(data, batch_size=32, collate_fn=collate_fn)

In [23]:
batch = next(iter(d))
print(f"Shape of context: {batch[0].shape}")
print(f"Shape of target input: {batch[1].shape}")
print(f"Shape of target output: {batch[2].shape}")

Shape of context: torch.Size([32, 49])
Shape of target input: torch.Size([32, 49])
Shape of target output: torch.Size([32, 49])


## Encoder model

In [24]:
# B: Batch size
# L: Max length
# E: Embedding size
# H: Hidden size

class Encoder(nn.Module):
    def __init__(self, embed_size, vocab_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=PAD_TOKEN)
        self.rnn = nn.GRU(embed_size, hidden_size, bidirectional=True, batch_first=True)

    def forward(self, context):
        # Context: [B, L]
        x = self.embedding(context) # [B, L, E]
        x, h = self.rnn(x)  # x: [B, L, 2*H], h: [2, B, H]
        h = torch.cat([h[0:1], h[1:2]], dim=2)
        return x, h # x: [B, L, 2*H], h: [1, B, 2*H]

In [25]:
vocab_size = context_vocab.length
embed_size = 300
hidden_size = 200

In [26]:
encoder = Encoder(embed_size, vocab_size, hidden_size)

In [27]:
print(f"Shape of context: {batch[0].shape}")
result = encoder.forward(batch[0])
print(f"Shape of encoder output: {result[0].shape}")
print(f"Shape of hidden state: {result[1].shape}")

Shape of context: torch.Size([32, 49])
Shape of encoder output: torch.Size([32, 49, 400])
Shape of hidden state: torch.Size([1, 32, 400])


## Attention layer

In [28]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.Q = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.K = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.V = nn.Linear(hidden_size * 2, 1)

    # Query: RNN decoder hidden state: [B, 1, 2*H]
    # Key: Encoder output: [B, L, 2*H]
    # Value: Encoder output: [B, L, 2*H]
    def forward(self, query, key):
        x = self.V(torch.tanh(self.Q(query) + self.K(key))) #[B, L, 1]
        x = x.squeeze(2).unsqueeze(1) #[B, 1, L]
        weights = F.softmax(x, dim=-1)
        context = torch.bmm(weights, key) #[B, 1, 2*H]
        return weights, context

In [29]:
attn = Attention(hidden_size)
query = result[1].permute(1, 0, 2)
key = result[0]
print(f"Shape of query: {query.shape} and shape of key: {key.shape}")

Shape of query: torch.Size([32, 1, 400]) and shape of key: torch.Size([32, 49, 400])


In [30]:
weights, context = attn.forward(query, key)
print(f"Shape of weights: {weights.shape} and shape of context: {context.shape}")

Shape of weights: torch.Size([32, 1, 49]) and shape of context: torch.Size([32, 1, 400])


## Decoder model

In [61]:
class Decoder(nn.Module):
    def __init__(self, embed_size, vocab_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=PAD_TOKEN)
        self.attention = Attention(hidden_size)
        self.rnn = nn.GRU(embed_size + 2 * hidden_size, hidden_size * 2, batch_first=True)
        self.out = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, encoder_out, hidden_init, target_in):
        decoder_hidden = hidden_init #[1, B, 2*H]
        decoder_outputs = []
        attentions = []

        for t in range(TARGET_MAX_LENGTH):
            decoder_in = target_in[:, t].unsqueeze(1) #[B, 1]
            output, decoder_hidden, weights = self.forward_step(encoder_out, decoder_hidden, decoder_in)
            decoder_outputs.append(output)
            attentions.append(weights)

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        attentions = torch.cat(attentions, dim=1)
        logits = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, encoder_out, decoder_hidden, decoder_in):
        token_embedded = self.embedding(decoder_in) #[B, 1, E]
        weights, context = self.attention(query=decoder_hidden.permute(1, 0, 2), key=encoder_out) #[B, 1, L], [B, 1, 2*H]
        gru_input = torch.concat([token_embedded, context], dim=2) #[B, 1, E+2*H]
        output, decoder_hidden = self.rnn(gru_input, decoder_hidden) #[B, 1, 2*H], [B, 1, 2*H]
        output = self.out(output) #[B, 1, 2*H]
        return output, decoder_hidden, weights #[B, 1, V], [1, B, 2*H]

In [62]:
decoder = Decoder(embed_size, vocab_size, hidden_size)

In [63]:
encoder_out = result[0]
hidden_init = result[1]
target_in = batch[1]
target_out = batch[2]
print(f"Shape of encoder output: {encoder_out.shape} and initial hidden vector: {hidden_init.shape}")
print(f"Shape of target input: {target_in.shape} and target output: {target_out.shape}")

Shape of encoder output: torch.Size([32, 49, 400]) and initial hidden vector: torch.Size([1, 32, 400])
Shape of target input: torch.Size([32, 49]) and target output: torch.Size([32, 49])


In [64]:
decoder_outputs, decoder_hidden, attentions = decoder.forward(encoder_out, hidden_init, target_in)

In [65]:
print(f"Shape of decoder output: {decoder_outputs.shape}")
print(f"Shape of decoder hidden: {decoder_hidden.shape}")
print(f"Shape of attention weights: {attentions.shape}")

Shape of decoder output: torch.Size([32, 49, 18169])
Shape of decoder hidden: torch.Size([1, 32, 400])
Shape of attention weights: torch.Size([32, 49, 49])


## Translation model

In [66]:
class Translator(nn.Module):
    def __init__(self, embed_size, hidden_size, input_vocab_size, output_vocab_size):
        super(Translator, self).__init__()
        self.encoder = Encoder(embed_size, input_vocab_size, hidden_size)
        self.decoder = Decoder(embed_size, output_vocab_size, hidden_size)

    def forward(self, context, target_in):
        encoder_out, hidden_init = self.encoder.forward(context)
        decoder_out, decoder_hidden, attentions = self.decoder.forward(encoder_out, hidden_init, target_in)
        return decoder_out, decoder_hidden, attentions

In [67]:
input_vocab_size = context_vocab.length
output_vocab_size = target_vocab.length
embed_size = 300
hidden_size = 200

In [68]:
model = Translator(embed_size, hidden_size, input_vocab_size, output_vocab_size)

In [69]:
context, target_in, target_out = batch
print(f"Context shape: {context.shape}")
print(f"Target input shape: {target_in.shape}")
print(f"Target output shape: {target_out.shape}")

Context shape: torch.Size([32, 49])
Target input shape: torch.Size([32, 49])
Target output shape: torch.Size([32, 49])


In [70]:
decoder_out, decoder_hidden, attentions = model.forward(context, target_in)

In [71]:
print(f"Shape of decoder output: {decoder_out.shape}")
print(f"Shape of decoder hidden: {decoder_hidden.shape}")
print(f"Shape of attention weights: {attentions.shape}")

Shape of decoder output: torch.Size([32, 49, 11401])
Shape of decoder hidden: torch.Size([1, 32, 400])
Shape of attention weights: torch.Size([32, 49, 49])


## Training loop

In [109]:
def train(model, train_data, optimizer, criterion, num_epochs, val_data=None):

    av_loss = []
    for epoch in range(1, num_epochs + 1):
        total_loss = 0
        count = 0
        val_data = iter(val_data) if val_data is not None else None

        for context, target_in, target_out in train_data:
            optimizer.zero_grad()
            logits, hidden, attentions = model.forward(context, target_in)
            loss = criterion(logits.view(-1, logits.size(-1)), target_out.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss
            print(f"Training loss: {loss.item()}") if count % 10 == 0 else None
            print(f"Logits contain nan") if torch.isnan(logits).any().item() else None

            if count % 50 == 0:
                if val_data is not None:
                    context, target_in, target_out = next(val_data)
                    logits, hidden, attentions = model.forward(context, target_in)
                    loss = criterion(logits.view(-1, logits.size(-1)), target_out.view(-1))
                    print(f"Validation loss: {loss.item()}")

            count += 1
        print(f"Epoch {epoch} with average loss {total_loss / count} --------------------------------------------")
        av_loss.append(total_loss / count)
    return av_loss

## Training preparations

In [110]:
target_raw, context_raw = load_data(path_to_file)
context = normalize(context_raw)
target = normalize(target_raw)
print(f"Length of preprocessed context: {len(context)}")
print(f"Length of preprocessed target: {len(target)}")

Length of preprocessed context: 118964
Length of preprocessed target: 118964


In [111]:
dataset = TextDataset(context, target, context_vocab, target_vocab)
CONTEXT_MAX_LENGTH = dataset.context_max_length
TARGET_MAX_LENGTH = dataset.target_max_length
print(f"Maximum length of context: {CONTEXT_MAX_LENGTH}")
print(f"Maximum length of target: {TARGET_MAX_LENGTH}")

Maximum length of context: 49
Maximum length of target: 49


In [112]:
print(f"Size of dataset: {len(dataset)}")
train_ratio, val_ratio = 0.6, 0.2
train_size, val_size = int(len(dataset) * train_ratio), int(len(dataset) * val_ratio)
test_size = len(dataset) - train_size - val_size
print(f"Size of training data: {train_size}, validation data: {val_size}, test_data: {test_size}")

Size of dataset: 118964
Size of training data: 71378, validation data: 23792, test_data: 23794


In [113]:
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [114]:
learning_rate = 0.001
optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [115]:
input_vocab_size = context_vocab.length
output_vocab_size = target_vocab.length
embed_size = 400
hidden_size = 256

## Training

In [116]:
model = Translator(embed_size, hidden_size, input_vocab_size, output_vocab_size)

In [117]:
num_epochs = 2

In [None]:
history = train(model, train_dataloader, optimizer, criterion, 2, val_data=val_dataloader)

Training loss: 0.019107336178421974
Validation loss: 0.019121477380394936
Training loss: 0.023945514112710953
