In [1]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [8]:
class NMTVectorizer(object):
  """Bundles the source/target vocabularies with the longest sequence lengths."""

  def __init__(self, source_vocab, target_vocab, max_source_length,
               max_target_length):
    """
    Args:
      source_vocab: vocabulary object for the source language
      target_vocab: vocabulary object for the target language
      max_source_length (int): token count of the longest source sequence
      max_target_length (int): token count of the longest target sequence
    """
    self.source_vocab = source_vocab
    self.target_vocab = target_vocab

    self.max_source_length = max_source_length
    self.max_target_length = max_target_length

  @classmethod
  def from_dataframe(cls, bitext_df):
    """Build a vectorizer by scanning every row of a parallel-text dataframe.

    Args:
      bitext_df (pandas.DataFrame): one sentence pair per row, with
        space-tokenised text in the "source_language" and
        "target_language" columns
        (NOTE(review): the source column was previously read as "source";
        confirm the column name against the dataset)
    Returns:
      NMTVectorizer: instance holding two freshly populated
      SequenceVocabulary objects and the maximum token counts seen
    """
    source_vocab = SequenceVocabulary()
    target_vocab = SequenceVocabulary()
    max_source_length, max_target_length = 0, 0

    for _, row in bitext_df.iterrows():
      # Both sides are split on single spaces so that the lengths are
      # measured in tokens, the same unit the vocabularies are built from.
      source_tokens = row["source_language"].split(" ")
      max_source_length = max(max_source_length, len(source_tokens))
      for token in source_tokens:
        source_vocab.add_token(token)

      target_tokens = row["target_language"].split(" ")
      max_target_length = max(max_target_length, len(target_tokens))
      for token in target_tokens:
        target_vocab.add_token(token)

    return cls(source_vocab, target_vocab, max_source_length,
               max_target_length)

In [11]:
class NMTVectorizer(object):
  """Turns source/target sentence pairs into fixed-length index vectors.

  NOTE(review): this cell declares NMTVectorizer again, so running it
  replaces the class object created by the earlier cell that defines
  __init__ and from_dataframe. The methods below read self.source_vocab,
  self.target_vocab, self.max_source_length and self.max_target_length;
  the two cells should be merged into a single class definition.
  """

  def _vectorize(self, indices, vector_length=-1, mask_index=0):
    """Copy `indices` into an int64 array padded on the right.

    Args:
      indices (list[int]): token indices of one sequence
      vector_length (int): output length; a negative value means
        "exactly len(indices)"
      mask_index (int): value written into the padding positions
    Returns:
      numpy.ndarray: int64 vector of length `vector_length`
    """
    if vector_length < 0:
      vector_length = len(indices)
    vector = np.full(vector_length, mask_index, dtype=np.int64)
    vector[:len(indices)] = indices
    return vector

  def _get_source_indices(self, text):
    """Look up the space-separated tokens of `text`, wrapped in begin/end markers.

    Args:
      text (str): source sentence, tokens separated by single spaces
    Returns:
      list[int]: [begin_seq_index, token indices..., end_seq_index]
    """
    indices = [self.source_vocab.begin_seq_index]
    indices.extend(self.source_vocab.lookup_token(token)
                   for token in text.split(" "))
    indices.append(self.source_vocab.end_seq_index)
    return indices

  def _get_target_indices(self, text):
    """Build the decoder input and the prediction target for `text`.

    Args:
      text (str): target sentence, tokens separated by single spaces
    Returns:
      tuple[list[int], list[int]]: (x_indices, y_indices) where x_indices
      is the token indices prefixed with begin_seq_index and y_indices is
      the same token indices followed by end_seq_index
    """
    indices = [self.target_vocab.lookup_token(token)
               for token in text.split(" ")]
    x_indices = [self.target_vocab.begin_seq_index] + indices
    y_indices = indices + [self.target_vocab.end_seq_index]
    return x_indices, y_indices

  def vectorize(self, source_text, target_text, use_dataset_max_lengths=True):
    """Vectorize one sentence pair.

    Args:
      source_text (str): source sentence
      target_text (str): target sentence
      use_dataset_max_lengths (bool): pad to the dataset-wide maximum
        lengths instead of to the length of this particular pair
    Returns:
      dict: "source_vector", "target_x_vector" and "target_y_vector"
      (int64 numpy arrays) plus "source_length", the unpadded number of
      source indices including the begin/end markers
    """
    source_vector_length = -1
    target_vector_length = -1

    if use_dataset_max_lengths:
      # +2: the source carries both a begin and an end marker.
      source_vector_length = self.max_source_length + 2
      # +1: each target vector carries exactly one marker (begin OR end).
      target_vector_length = self.max_target_length + 1

    source_indices = self._get_source_indices(source_text)
    source_vector = self._vectorize(source_indices,
                                    vector_length=source_vector_length,
                                    mask_index=self.source_vocab.mask_index)

    target_x_indices, target_y_indices = self._get_target_indices(target_text)
    target_x_vector = self._vectorize(target_x_indices,
                                      vector_length=target_vector_length,
                                      mask_index=self.target_vocab.mask_index)
    target_y_vector = self._vectorize(target_y_indices,
                                      vector_length=target_vector_length,
                                      mask_index=self.target_vocab.mask_index)

    return {"source_vector": source_vector,
            "target_x_vector": target_x_vector,
            "target_y_vector": target_y_vector,
            "source_length": len(source_indices)}

In [12]:
def generate_nmt_batches(dataset, batch_size, shuffle=True,
                            drop_last=True, device="cpu"):
    """Yield batches whose rows are ordered from longest to shortest source.

    Wraps a DataLoader over `dataset`; every batch is expected to be a dict
    of tensors containing an 'x_source_length' entry. Each tensor in the
    batch is re-ordered by descending 'x_source_length' and moved to
    `device` before being yielded.

    Args:
        dataset: dataset handed to the DataLoader
        batch_size (int): number of rows per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target device for every tensor of the batch
    Yields:
        dict: same keys as the DataLoader batch, rows sorted by length
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        source_lengths = batch['x_source_length'].numpy()
        # Ascending argsort reversed -> row positions, longest sequence first.
        longest_first = source_lengths.argsort()[::-1].tolist()

        yield {key: values[longest_first].to(device)
               for key, values in batch.items()}

In [13]:
class NMTModel(nn.Module):
    """Encoder-decoder translation model: an NMTEncoder feeding an NMTDecoder."""

    def __init__(self, source_vocab_size, source_embedding_size,
                 target_vocab_size, target_embedding_size, encoding_size,
                 target_bos_index):
        """
        Args:
            source_vocab_size (int): number of source embeddings
            source_embedding_size (int): size of each source embedding
            target_vocab_size (int): number of target embeddings
            target_embedding_size (int): size of each target embedding
            encoding_size (int): hidden size handed to the encoder RNN
            target_bos_index (int): begin-of-sequence index for the decoder
        """
        super().__init__()

        encoder_kwargs = dict(num_embeddings=source_vocab_size,
                              embedding_size=source_embedding_size,
                              rnn_hidden_size=encoding_size)
        self.encoder = NMTEncoder(**encoder_kwargs)

        # The decoder is configured with twice the encoder's hidden size.
        decoder_kwargs = dict(num_embeddings=target_vocab_size,
                              embedding_size=target_embedding_size,
                              rnn_hidden_size=2 * encoding_size,
                              bos_index=target_bos_index)
        self.decoder = NMTDecoder(**decoder_kwargs)

    def forward(self, x_source, x_source_lengths, target_sequence):
        """Encode the source batch, then decode against `target_sequence`.

        Args:
            x_source: source batch passed to the encoder
            x_source_lengths: per-row source lengths passed to the encoder
            target_sequence: target batch passed to the decoder
        Returns:
            whatever the decoder returns for this batch
        """
        encoded, final_hidden = self.encoder(x_source, x_source_lengths)
        return self.decoder(encoder_state=encoded,
                            initial_hidden_state=final_hidden,
                            target_sequence=target_sequence)

In [14]:
class NMTEncoder(nn.Module):
    """Embeds a padded source batch and runs it through a bidirectional GRU."""

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        """
        Args:
            num_embeddings (int): rows of the embedding table
            embedding_size (int): size of each embedding vector
            rnn_hidden_size (int): hidden size of each GRU direction
        """
        super().__init__()

        # Index 0 is the padding index of the embedding table.
        self.source_embedding = nn.Embedding(num_embeddings, embedding_size,
                                             padding_idx=0)
        self.birnn = nn.GRU(embedding_size, rnn_hidden_size,
                            bidirectional=True, batch_first=True)

    def forward(self, x_source, x_lengths):
        """Encode a batch of padded index sequences.

        Args:
            x_source (torch.Tensor): index tensor, batch dimension first
            x_lengths (torch.Tensor): length of every row of `x_source`
        Returns:
            tuple: (per-step GRU outputs re-padded to a batch-first tensor,
            final hidden states of both directions concatenated per row)
        """
        embedded = self.source_embedding(x_source)

        lengths_np = x_lengths.detach().cpu().numpy()
        packed_input = pack_padded_sequence(embedded, lengths_np,
                                            batch_first=True)

        packed_output, final_hidden = self.birnn(packed_input)

        # (directions, batch, hidden) -> (batch, directions, hidden)
        final_hidden = final_hidden.permute(1, 0, 2)
        n_rows = final_hidden.size(0)
        # Flatten the two directions into one feature axis per row.
        final_hidden = final_hidden.contiguous().view(n_rows, -1)

        padded_output, _ = pad_packed_sequence(packed_output, batch_first=True)

        return padded_output, final_hidden

In [16]:
# Toy batch of three float sequences, each written out to length 4; the
# trailing zeros in the second and third rows stand in for padding.
abcd_padded = torch.tensor([1,2,3,4], dtype=torch.float32)
efg_padded = torch.tensor([5, 6, 7, 0], dtype=torch.float32)
h_padded = torch.tensor([8, 0, 0, 0], dtype=torch.float32)

# Stack the rows into one (3, 4) batch-first tensor.
padded_tensor = torch.stack([abcd_padded, efg_padded, h_padded])

print(padded_tensor)

tensor([[1., 2., 3., 4.],
        [5., 6., 7., 0.],
        [8., 0., 0., 0.]])


In [17]:
# Number of real (non-padding) entries in each row of padded_tensor,
# listed in descending order.
lengths = [4, 3, 1]
# Pack the batch; the recorded output shows the padding zeros are gone and
# the data is laid out time step by time step.
packed_tensor = pack_padded_sequence(padded_tensor, lengths,
                                          batch_first=True)
packed_tensor

PackedSequence(data=tensor([1., 5., 8., 2., 6., 3., 7., 4.]), batch_sizes=tensor([3, 2, 2, 1]), sorted_indices=None, unsorted_indices=None)

In [18]:
# Reverse the packing: returns the re-padded batch-first tensor together
# with the length of every sequence.
unpacked_tensor, unpacked_lengths = \
  pad_packed_sequence(packed_tensor, batch_first=True)

print(unpacked_tensor)
print(unpacked_lengths)

tensor([[1., 2., 3., 4.],
        [5., 6., 7., 0.],
        [8., 0., 0., 0.]])
tensor([4, 3, 1])


In [19]:
class NMTDecoder(nn.Module):
    """GRU-cell decoder that attends over the encoder states at every step."""

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        """
        Args:
            num_embeddings (int): rows of the target embedding table; also
                the number of output scores per step
            embedding_size (int): size of each target embedding vector
            rnn_hidden_size (int): hidden size of the GRU cell; the context
                vectors concatenated with the embeddings have this size too
            bos_index (int): begin-of-sequence index
        """
        super(NMTDecoder, self).__init__()
        self._rnn_hidden_size = rnn_hidden_size
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings,
                                             embedding_dim=embedding_size,
                                             padding_idx=0)
        # The cell input is [target embedding ; previous context vector].
        self.gru_cell = nn.GRUCell(embedding_size + rnn_hidden_size,
                                   rnn_hidden_size)
        self.hidden_map = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        # The classifier input is [context vector ; hidden state].
        self.classifier = nn.Linear(rnn_hidden_size * 2, num_embeddings)
        self.bos_index = bos_index

    def _init_indices(self, batch_size):
        """Return an int64 vector of length `batch_size` filled with bos_index."""
        return torch.ones(batch_size, dtype=torch.int64) * self.bos_index

    def _init_context_vectors(self, batch_size):
        """Return an all-zero (batch_size, rnn_hidden_size) context tensor."""
        return torch.zeros(batch_size, self._rnn_hidden_size)

    def forward(self, encoder_state, initial_hidden_state, target_sequence):
        """Decode with teacher forcing: step i is fed column i of `target_sequence`.

        Args:
            encoder_state (torch.Tensor): encoder outputs, batch dimension
                first; attended over at every step
            initial_hidden_state (torch.Tensor): per-row vector mapped through
                `hidden_map` to obtain the first hidden state
            target_sequence (torch.Tensor): 2-D index tensor, batch
                dimension first
        Returns:
            torch.Tensor: classifier scores stacked to
            (batch, target length, num_embeddings)
        """
        # Iterate over time steps: (batch, seq) -> (seq, batch).
        target_sequence = target_sequence.permute(1, 0)
        output_sequence_size = target_sequence.size(0)

        h_t = self.hidden_map(initial_hidden_state)

        batch_size = encoder_state.size(0)
        context_vectors = self._init_context_vectors(batch_size)

        h_t = h_t.to(encoder_state.device)
        context_vectors = context_vectors.to(encoder_state.device)

        output_vectors = []
        # Per-call caches of intermediate values, stored as numpy arrays.
        self._cached_p_attn = []
        self._cached_ht = []
        self._cached_decoder_state = encoder_state.cpu().detach().numpy()

        for i in range(output_sequence_size):
            y_t_index = target_sequence[i]

            y_input_vector = self.target_embedding(y_t_index)
            rnn_input = torch.cat([y_input_vector, context_vectors], dim=1)

            h_t = self.gru_cell(rnn_input, h_t)
            self._cached_ht.append(h_t.cpu().detach().numpy())

            context_vectors, p_attn, _ = verbose_attention(encoder_state_vectors=encoder_state,
                                                           query_vector=h_t)

            self._cached_p_attn.append(p_attn.cpu().detach().numpy())

            prediction_vector = torch.cat((context_vectors, h_t), dim=1)
            # F.dropout is active by default; tie it to the module's mode so
            # that model.eval() really disables it.
            score_for_y_t_index = self.classifier(
                F.dropout(prediction_vector, 0.3, training=self.training))

            output_vectors.append(score_for_y_t_index)

        # (seq, batch, vocab) -> (batch, seq, vocab), batch-first like the inputs.
        output_vectors = torch.stack(output_vectors).permute(1, 0, 2)

        return output_vectors

In [20]:
def verbose_attention(encoder_state_vectors, query_vector):
    """Dot-product attention written out step by step.

    Args:
        encoder_state_vectors (torch.Tensor): 3-D tensor of shape
            (batch, num_vectors, vector_size)
        query_vector (torch.Tensor): tensor viewable as
            (batch, 1, vector_size)
    Returns:
        tuple: (context_vectors of shape (batch, vector_size),
        attention probabilities of shape (batch, num_vectors),
        raw scores of shape (batch, num_vectors))
    """
    n_batch, n_states, state_dim = encoder_state_vectors.size()

    # Dot product of the query with every encoder state.
    query = query_vector.view(n_batch, 1, state_dim)
    vector_scores = (encoder_state_vectors * query).sum(dim=2)

    # Normalise the scores over the encoder states.
    vector_probabilities = F.softmax(vector_scores, dim=1)

    # Probability-weighted sum of the encoder states.
    weights = vector_probabilities.view(n_batch, n_states, 1)
    context_vectors = (encoder_state_vectors * weights).sum(dim=1)

    return context_vectors, vector_probabilities, vector_scores

def terse_attention(encoder_state_vectors, query_vector):
    """Dot-product attention expressed with batched matrix products.

    Args:
        encoder_state_vectors (torch.Tensor): expected to be 3-D,
            (batch, num_vectors, vector_size), as in verbose_attention
        query_vector (torch.Tensor): expected to be 2-D,
            (batch, vector_size)
    Returns:
        tuple: (context_vectors of shape (batch, vector_size),
        attention probabilities of shape (batch, num_vectors))
    """
    # (B, N, D) @ (B, D, 1) -> (B, N, 1). Only the trailing singleton is
    # dropped: a bare squeeze() would also remove a batch or num_vectors
    # axis of size 1 and break the steps below.
    vector_scores = torch.matmul(encoder_state_vectors,
                                 query_vector.unsqueeze(dim=2)).squeeze(dim=2)
    vector_probabilities = F.softmax(vector_scores, dim=-1)
    # (B, D, N) @ (B, N, 1) -> (B, D, 1) -> (B, D)
    context_vectors = torch.matmul(encoder_state_vectors.transpose(-2, -1),
                                   vector_probabilities.unsqueeze(dim=2)).squeeze(dim=2)
    return context_vectors, vector_probabilities