# Load Dependencies

In [1]:
%%capture
!pip install kaggle
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import math
import copy
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import random
import nltk
import re
nltk.download("punkt")
nltk.download('stopwords')

In [2]:
!mkdir /root/.kaggle/
!cp -f ./kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json
!rm -f kaggle.json

mkdir: cannot create directory ‘/root/.kaggle/’: File exists
cp: cannot stat './kaggle.json': No such file or directory


In [3]:
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset
!unzip en-fr-translation-dataset.zip
!rm -f en-fr-translation-dataset.zip

Downloading en-fr-translation-dataset.zip to /content
100% 2.54G/2.54G [00:37<00:00, 54.2MB/s]
100% 2.54G/2.54G [00:37<00:00, 72.8MB/s]
Archive:  en-fr-translation-dataset.zip
  inflating: en-fr.csv               


# Load Dataset From Kaggle  

In [3]:
# Load the parallel corpus from en-fr.csv; nrows limits parsing to the first 100 rows of the file
en_fr_csv = pd.read_csv("en-fr.csv", nrows =  100)

In [4]:
# Pull the 'en' and 'fr' columns out as plain, index-aligned Python lists
english_corpus, french_corpus = (list(en_fr_csv[column]) for column in ('en', 'fr'))

In [5]:
def process_corpus(english, french, min_length = 15):
  '''
  Tokenizes, lower-cases and strips punctuation from a parallel corpus.
  english: list of raw English sentences
  french: list of raw French sentences, index-aligned with english
  min_length: minimum number of English tokens a pair needs in order to be kept
  Returns (english_tokenized, french_tokenized): two index-aligned lists of sentences(lists of words).
  Pairs are skipped when either side is not a string(for example a missing value read from the CSV)
  or when the French side has no tokens left after cleaning.
  '''
  def _clean_tokens(sentence, language):
    # Tokenize one sentence, lower-case every token and drop anything that is not a word character or whitespace
    cleaned = []
    for word in nltk.word_tokenize(sentence, language = language):
      stripped = re.sub(r'[^\w\s]', "", str.lower(word))
      if stripped != "":
        cleaned += [stripped]
    return cleaned

  corpus_english_tokenized = []
  corpus_french_tokenized = []
  # zip stops at the shorter list, so a length mismatch cannot index past the end of french
  for english_sentence, french_sentence in tqdm.tqdm(zip(english, french), total = len(english)):
    # Skip unusable rows explicitly instead of hiding every error behind a bare except:
    # real failures(e.g. missing NLTK tokenizer data) now surface instead of silently emptying the corpus.
    if not isinstance(english_sentence, str) or not isinstance(french_sentence, str):
      continue
    english_tokenized = _clean_tokens(english_sentence, 'english')
    french_tokenized = _clean_tokens(french_sentence, 'french')
    # WordEmbeddings.call asserts that every sentence is non-empty, so empty French targets are dropped here
    if len(english_tokenized) >= min_length and len(french_tokenized) > 0:
      corpus_english_tokenized += [english_tokenized]
      corpus_french_tokenized += [french_tokenized]
  return corpus_english_tokenized, corpus_french_tokenized

In [6]:
# Tokenize and clean both corpora, using process_corpus's default min_length filter
english_processed, french_processed = process_corpus(english_corpus, french_corpus)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [7]:
class EnFrDataset(keras.utils.Sequence):
  '''
  Serves index-aligned batches of English and French sentences.
  Batches are handed out round-robin from an internal cursor(cur_idx);
  the index passed to __getitem__ is not used.
  '''
  def __init__(self, english, french, batch_size):
    self.english = english
    self.french = french
    self.batch_size = batch_size
    self.cur_idx = 0 # Position of the next batch to hand out
  def __len__(self):
    # Number of full batches; a trailing partial batch is never served
    return len(self.english) // self.batch_size
  def __getitem__(self, idx):
    start = self.cur_idx * self.batch_size
    stop = start + self.batch_size
    batch = (self.english[start:stop], self.french[start:stop])
    # Advance the cursor and wrap around once the last full batch has been served
    following = self.cur_idx + 1
    self.cur_idx = following if following < len(self) else 0
    return batch
    

In [8]:
# Batches of 32 sentence pairs; deep copies keep the dataset's lists independent of english_processed / french_processed
TranslationDataset = EnFrDataset(copy.deepcopy(english_processed), copy.deepcopy(french_processed), 32)

Word Embeddings and Loading Pretrained GloVe Vectors

In [10]:
!kaggle datasets download -d rtatman/glove-global-vectors-for-word-representation
!unzip glove-global-vectors-for-word-representation.zip
!rm -f glove.6B.100d.txt
!rm -f glove.6B.50d.txt
!rm -f glove-global-vectors-for-word-representation.zip

Downloading glove-global-vectors-for-word-representation.zip to /content
 98% 450M/458M [00:05<00:00, 103MB/s]
100% 458M/458M [00:05<00:00, 88.4MB/s]
Archive:  glove-global-vectors-for-word-representation.zip
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.50d.txt        


In [9]:
# The 200-dimensional GloVe file: the one left after the download cell removes the 100d and 50d variants
glove_path = "./glove.6B.200d.txt"

In [10]:
class WordEmbeddings(keras.layers.Layer):
  '''
  Vocabulary and embedding table for one language.
  corpus: list of sentences(lists of words) the vocabulary is built from
  glove_path: path to a GloVe text file; only read when french is False
  french: when True the embedding table gets no pretrained weights and call() returns token ids instead of embeddings
  Ids 0-3 are reserved for <PAD>, <UNK>, <START> and <END>; corpus words start at id 4.
  '''
  def __init__(self, corpus, glove_path, french = False):
    super().__init__()
    self.glove_path = glove_path
    self.french = french
    self.unique_words = self._compute_unique_words(corpus)
    self.word_2_idx = {self.unique_words[idx]: idx + 4 for idx in range(len(self.unique_words))}
    self.word_2_idx['<PAD>'] = 0
    self.word_2_idx['<UNK>'] = 1
    self.word_2_idx["<START>"] = 2
    self.word_2_idx['<END>'] = 3

    self.idx_2_word = {idx + 4: self.unique_words[idx] for idx in range(len(self.unique_words))}
    self.idx_2_word[0] = "<PAD>"
    self.idx_2_word[1] = "<UNK>"
    self.idx_2_word[2] = "<START>"
    self.idx_2_word[3] = "<END>"

    self.len_vocab = len(self.unique_words) + 4
    
    self.embedding_dim = 200
    if not self.french:
      self.glove_embeddings = self.load_glove(self.glove_path)
      self.word_embeddings = keras.layers.Embedding(self.len_vocab, self.embedding_dim, weights = [self.glove_embeddings])
    else:
      self.word_embeddings = keras.layers.Embedding(self.len_vocab, self.embedding_dim)
  def _compute_unique_words(self, corpus):
    '''
    Computes the Unique Words inside of the corpus
    corpus: the corpus to be trained on.
    Returns the words in sorted order.
    '''
    words = [word for sentence in corpus for word in sentence]
    # Sorting pins the word -> id mapping. The iteration order of a set of strings can change
    # between interpreter runs, which would make previously saved weights point at the wrong words.
    return sorted(set(words))
  def load_glove(self, glove_path):
    '''
    Loads in the GLoVE vectors 
    Returns an array of shape (len_vocab, embedding_dim): rows of words found in the GloVe file
    hold their pretrained vector, every other row keeps its Glorot-uniform initial value.
    '''
    initializer = tf.initializers.GlorotUniform()
    embeddings = initializer(shape = (self.len_vocab, self.embedding_dim)).numpy()
    # Read the file as UTF-8 explicitly rather than relying on the platform default encoding
    with open(glove_path, 'r', encoding = 'utf-8') as file:
      for line in file:
        array = line.split()
        word = array[0]
        if word in self.word_2_idx:
          word_embed = np.array([float(val) for val in array[1:]])
          embeddings[self.word_2_idx[word]] = word_embed
    return embeddings
  def _max_length(self, x):
    '''
    Computes the maximum length of a corpus
    '''
    return max((len(sentence) for sentence in x), default = 0)
  def _tokenize(self, x, max_sent_length):
    '''
    Pads(or Truncates) and tokenizes every word in the corpus
    Words missing from the vocabulary become <UNK>; short sentences are right-padded with <PAD>.
    '''
    pad_idx = self.word_2_idx["<PAD>"]
    unk_idx = self.word_2_idx['<UNK>']
    tokenized_sentences = []
    for sentence in x:
      tokenized_sent = [self.word_2_idx.get(word, unk_idx) for word in sentence[:max_sent_length]]
      tokenized_sent += [pad_idx] * (max_sent_length - len(tokenized_sent))
      tokenized_sentences += [tokenized_sent]
    return tokenized_sentences
  def grab_embeddings(self, x):
    '''
    x: Tensor(B, L)
    Returns the embedding vectors of the given token ids.
    '''
    return self.word_embeddings(x)
  def call(self, x, max_sent_length = None):
    '''
    Tokenizes and Grabs the Embeddings of a given sentence
    x: List of Sentences(List of Words)
    Every sentence is wrapped as <START> words... <END> and padded up to max_sent_length.
    When max_sent_length is not given, the longest wrapped sentence sets the length.
    Sentences longer than max_sent_length are cut, with <END> kept as their final token.
    Returns the embeddings of the tokens, or the raw token ids when french is True.
    '''
    # Build fresh lists so the caller's sentences are never modified
    wrapped = []
    for sentence in x:
      assert len(sentence) > 0, x
      # Append <END> instead of overwriting the last word, so no real word is lost
      wrapped += [['<START>'] + list(sentence) + ['<END>']]
    if not max_sent_length:
      # Measured after wrapping, so the special tokens always fit
      max_sent_length = self._max_length(wrapped)
    for sentence in wrapped:
      if len(sentence) > max_sent_length:
        # Truncate, but keep <END> inside the part that survives tokenization
        del sentence[max_sent_length:]
        sentence[-1] = '<END>'
    tokens = np.array(self._tokenize(wrapped, max_sent_length))
    if not self.french:
      return self.word_embeddings(tokens)
    return tokens    

In [11]:
class EnglishFrenchEmbeddings(keras.layers.Layer):
  '''
  Holds one WordEmbeddings per language so a single object serves both sides during training.
  '''
  def __init__(self, en_corpus, fr_corpus, glove_path):
    super().__init__()
    # Only the English side receives the GloVe file; the French side is built with french = True and no path
    self.english = WordEmbeddings(en_corpus, glove_path)
    self.french = WordEmbeddings(fr_corpus, None, french = True)
  def call(self, text, max_length = None, french = False):
    '''
    Routes text to the French WordEmbeddings when french is truthy, otherwise to the English one.
    max_length is forwarded as max_sent_length.
    '''
    selected = self.french if french else self.english
    return selected(text, max_sent_length = max_length)


# LSTM From Scratch

Step 1: One LSTM Cell 

In [12]:
class LSTMCell(keras.layers.Layer):
  '''
  One LSTM step. Each of the input(i), forget(f) and output(o) gates and the tanh candidate(g)
  is a Dense layer of width cell_state applied to the concatenation [x, hidden_state].
  '''
  def __init__(self, cell_state):
    super().__init__()
    self.cell_state = cell_state
    self.i = keras.layers.Dense(self.cell_state, activation = 'sigmoid')
    self.f = keras.layers.Dense(self.cell_state, activation = 'sigmoid')
    self.o = keras.layers.Dense(self.cell_state, activation = 'sigmoid')
    self.g = keras.layers.Dense(self.cell_state, activation = 'tanh')
  def call(self, x, hidden_state, cell_state):
    '''
    x: Tensor(B, input_size)
    hidden_state: Tensor(B, hidden_size)
    cell_state: Tensor(B, cell_size)
    Returns (new hidden state, new cell state).
    '''
    gate_input = tf.concat([x, hidden_state], axis = 1) # (B, input_size + hidden_size)
    input_gate = self.i(gate_input) # (B, cell_state)
    forget_gate = self.f(gate_input) # (B, cell_state)
    output_gate = self.o(gate_input) # (B, cell_state)
    candidate = self.g(gate_input) # (B, cell_state)
    # Keep part of the old cell state and add the gated candidate
    new_cell = input_gate * candidate + cell_state * forget_gate
    return output_gate * tf.tanh(new_cell), new_cell

Step 2: Bidirectional LSTM Encoder

In [13]:
class SingleDirectionLSTM(keras.layers.Layer):
  '''
  Runs one LSTMCell over a sequence from its first to its last time step,
  starting from a trainable initial hidden state and cell state.
  '''
  def __init__(self, hidden_size, cell_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.cell_size = cell_size
    # Learned Hidden and Cell State 
    initializer = tf.initializers.GlorotUniform()
    self.initial_hidden = tf.Variable(initial_value = initializer(shape = (1, self.hidden_size), dtype = "float32"), trainable = True)
    self.initial_cell = tf.Variable(initial_value = initializer(shape = (1, self.cell_size), dtype = 'float32'), trainable = True)
    self.LSTM_Cell = LSTMCell(self.cell_size)

  def call(self, x):
    '''
    Encodes a Given Input:
    x: Tensor(B, L, C)
    Returns all Hidden States, stacked along axis 1
    '''
    B, L, _ = x.shape
    # Broadcast the single learned initial state to every sentence in the batch
    hidden_state = tf.repeat(tf.identity(self.initial_hidden), B, axis = 0)
    cell_state = tf.repeat(tf.identity(self.initial_cell), B, axis = 0)
    hidden_states = []
    for i in range(L):
      hidden_state, cell_state = self.LSTM_Cell(x[:, i, :], hidden_state, cell_state)
      # Tensors are immutable values, so each step's state is stored directly; no deepcopy is needed
      hidden_states.append(hidden_state)
    return tf.stack(hidden_states, axis = 1)


    

In [14]:
class BidirectionalLSTMEncoder(keras.layers.Layer):
  '''
  Encodes a sequence in both directions.
  Both passes go through the same SingleDirectionLSTM instance, so they share its weights.
  '''
  def __init__(self, hidden_size, cell_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.cell_size = cell_size
    self.LSTMCell = SingleDirectionLSTM(self.hidden_size, self.cell_size)
  def call(self, x):
    '''
    x: Tensor(B, L, C)
    Returns (hidden_states, output_hidden_state):
    hidden_states: per-position forward and backward states concatenated on the feature axis
    output_hidden_state: final forward state concatenated with the final backward state
    '''
    # Flip the time axis for the backward pass
    reversed_x = tf.reverse(x, axis = [1]) # (B, L, C)
    # Run Both the Reversed and Forward versions
    forward_hidden_states = self.LSTMCell(x)
    reversed_hidden_states = self.LSTMCell(reversed_x)

    # The summary pairs the last forward state with the last backward state
    # (the backward pass finishes on the first source word), not the forward state twice.
    output_hidden_state = tf.concat([forward_hidden_states[:, -1, :], reversed_hidden_states[:, -1, :]], axis = 1)
    # Flip the backward states back so that index t of both halves describes source position t
    backward_hidden_states = tf.reverse(reversed_hidden_states, axis = [1])
    hidden_states = tf.concat([forward_hidden_states, backward_hidden_states], axis = 2)
    
    return hidden_states, output_hidden_state

Step 3: Attention LSTM Decoder

In [85]:
class LuongAttention(keras.layers.Layer):
  '''
  Computes Multiplicative(Or Luong) Attention across hidden states 
  '''
  def __init__(self, decoder_size):
    super().__init__()
    self.decoder_size = decoder_size
    self.encoder_linear = keras.layers.Dense(self.decoder_size, activation = None)
  def call(self, decoder_state, encoder_states):
    '''
    decoder_state: Shape(B, decoder_size)
    encoder_states: Shape(B, L, encoder_size)
    Returns the attention context, Shape(B, 1, decoder_size); callers squeeze the middle axis.
    '''
    # Expand Decoder State Across middle axis
    decoder_expanded = tf.expand_dims(decoder_state, 1) # (B, 1, decoder_size)
    # Project the ENCODER states into the decoder's space. Projecting the decoder state here
    # would leave encoder_states unused and make the softmax run over a single element.
    encoder_linear = self.encoder_linear(encoder_states) # (B, L, decoder_size)
    # Dot Product
    scores = tf.matmul(decoder_expanded, encoder_linear, transpose_b = True) # (B, 1, L)
    att_mat = keras.activations.softmax(scores) # (B, 1, L), normalised over the L encoder positions
    # Attention-weighted sum of the projected encoder states
    context = tf.matmul(att_mat, encoder_linear) # (B, 1, decoder_size)
    return context
    

In [86]:
class LSTMDecoder(keras.layers.Layer):
  '''
  Attention LSTM Decoder.
  The initial hidden state is supplied by the caller; only the initial cell state is a trainable variable.
  '''
  def __init__(self, hidden_size, cell_size, embeddings):
    super().__init__()
    self.embeddings = embeddings # This embeddings layer is used to extract special characters.
    self.hidden_size = hidden_size
    self.cell_size = cell_size
    initializer = tf.initializers.GlorotUniform()
    self.initial_cell_state = tf.Variable(initializer(shape = (1, self.cell_size), dtype = 'float32'), trainable = True)
    self.attention_fn = LuongAttention(self.hidden_size)
    self.LSTMCell = LSTMCell(self.cell_size)
    self.Dense = keras.layers.Dense(self.embeddings.french.len_vocab)
    self.loss_function = keras.losses.SparseCategoricalCrossentropy(from_logits = True)
  def _embed_french_tokens(self, tokens):
    '''
    tokens: (B,) French token ids
    Returns their embeddings, Tensor(B, embedding_dim)
    '''
    embedded = self.embeddings.french.grab_embeddings(tf.expand_dims(tokens, axis = 1)) # (B, 1, embedding_dim)
    # Squeeze only the length axis; an axis-less squeeze would also drop the batch axis when B == 1
    return tf.squeeze(embedded, axis = 1)
  def _decode_step(self, step_input, hidden_state, cell_state, encoder_states):
    '''
    Runs one LSTM step followed by attention and the vocabulary projection.
    Returns (logits over the French vocabulary, new hidden state, new cell state)
    '''
    hidden_state, cell_state = self.LSTMCell(step_input, hidden_state, cell_state)
    # Attend over the encoder states with the fresh hidden state
    context = tf.squeeze(self.attention_fn(hidden_state, encoder_states), axis = 1)
    logits = self.Dense(tf.concat([hidden_state, context], 1))
    return logits, hidden_state, cell_state
  def _greedy_decode(self, hidden_state, cell_state, encoder_states, B, max_length):
    '''
    Generates up to max_length words per sentence, always picking the most likely word.
    Returns a list of B sentences(lists of words), each starting with <START>.
    '''
    french = self.embeddings.french
    START_TOKEN = french.word_2_idx["<START>"]
    END_TOKEN = french.word_2_idx["<END>"]
    current_tokens = tf.repeat(np.array(START_TOKEN), B, axis = 0)
    output_sentences = [["<START>"] for _ in range(B)]
    # ended_sentences asks if the END token has been generated
    ended_sentences = [False] * B
    for _ in range(max_length):
      step_input = self._embed_french_tokens(current_tokens)
      logits, hidden_state, cell_state = self._decode_step(step_input, hidden_state, cell_state, encoder_states)
      # argmax over the raw logits picks the same word as argmax over their softmax
      current_tokens = tf.argmax(logits, axis = -1)
      selected_words = current_tokens.numpy()
      for sentence_idx in range(B):
        if ended_sentences[sentence_idx]:
          continue
        word_id = selected_words[sentence_idx].item()
        output_sentences[sentence_idx] += [french.idx_2_word[word_id]]
        ended_sentences[sentence_idx] = word_id == END_TOKEN
      # Nothing more can be appended once every sentence has produced <END>
      if all(ended_sentences):
        break
    return output_sentences
  def _teacher_forced_loss(self, hidden_state, cell_state, encoder_states, y, B):
    '''
    Feeds the ground truth word at step i and scores the prediction of the word at step i + 1.
    Returns the per-step losses summed over all steps and divided by the batch size B.
    '''
    PAD_TOKEN = self.embeddings.french.word_2_idx["<PAD>"]
    L_decoder = y.shape[1]
    total_loss = tf.identity(np.zeros((1))) # Create empty loss tensor
    for i in range(L_decoder - 1):
      y_GT = y[:, i + 1]
      # Score only positions whose TARGET is a real token. Masking on the input token instead
      # would also train the model to emit <PAD> right after <END>.
      has_target = y_GT != PAD_TOKEN
      step_input = self._embed_french_tokens(y[:, i])
      logits, hidden_state, cell_state = self._decode_step(step_input, hidden_state, cell_state, encoder_states)
      logits = tf.cast(logits, tf.double)
      # Compute loss
      total_loss = total_loss + self.loss_function(y_GT[has_target], logits[has_target])
    return total_loss / B
  def call(self, last_hidden_state, encoder_states, y = None, max_length = 25, print_states = False):
    '''
    last_hidden_state: Tensor(B, 2 * hidden_size), used as the initial decoder hidden state
    encoder_states: Tensor(B, L_encoder, 2 * hidden_size)
    y: Tensor(B, L_Decoder), returns loss if y is provided(using teacher forcing.)
    max_length: maximum number of words generated per sentence when y is not provided
    print_states: accepted for compatibility with existing callers; it does not change the result
    Without y, runs autoregressively and returns the predicted sentences(lists of words).
    This mode should only be used at eval time.
    '''
    B = last_hidden_state.shape[0]
    cell_state = tf.repeat(tf.identity(self.initial_cell_state), B, axis = 0) # (B, Cell_size)
    hidden_state = last_hidden_state
    if y is None:
      return self._greedy_decode(hidden_state, cell_state, encoder_states, B, max_length)
    return self._teacher_forced_loss(hidden_state, cell_state, encoder_states, y, B)


Step 4: LSTM Model

In [87]:
class NMTLSTM(keras.Model):
  '''
  English to French translation model: word embeddings, a bidirectional LSTM encoder
  and an attention LSTM decoder built with twice the encoder's hidden and cell sizes.
  '''
  def __init__(self, en_corpus, fr_corpus, glove_path, hidden_size, cell_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.cell_size = cell_size
    self.embeddings = EnglishFrenchEmbeddings(en_corpus, fr_corpus, glove_path)
    self.LSTMEncoder = BidirectionalLSTMEncoder(self.hidden_size, self.cell_size)
    self.Decoder = LSTMDecoder(2 * self.hidden_size, 2 * self.cell_size, self.embeddings)
  def call(self, x, y = None, print_states = False):
    '''
    Encodes and Decodes the text.
    x: List of sentences(list of words) in english
    y: List of sentences(list of words) in french 
    Returns the decoder's loss when y is given, otherwise the decoder's predicted sentences.
    '''
    training_mode = y is not None
    x_english = self.embeddings.call(x) # (B, L_encoder, 200)
    # French targets are only tokenized when they were supplied
    y_french = self.embeddings.call(y, french = True) if training_mode else None # (B, L_decoder)
    # Encode the english words
    hidden_states, beginning_hidden_state = self.LSTMEncoder(x_english)
    # Shapes hidden_state: Tensor(B, L, 2 * hidden_size), beginning_hidden_state = Tensor(B, 2 * hidden_size)
    if not training_mode:
      return self.Decoder(beginning_hidden_state, hidden_states)
    return self.Decoder(beginning_hidden_state, hidden_states, y = y_french, print_states = print_states)

In [88]:
# Build the translator; 256 is passed as both hidden_size and cell_size, and the vocabularies come from the processed corpora
model = NMTLSTM(english_processed, french_processed, glove_path, 256, 256)

In [None]:
# Resume from previously saved weights when they exist. On a fresh run there is nothing to load yet,
# so the cell reports that and continues instead of failing.
if tf.io.gfile.exists("./model/model.index"):
  model.load_weights("./model/model")
else:
  print("No saved weights found at ./model/model; starting from freshly initialised weights")

Step 5: Train the Model with Gradient Tape.

In [90]:
def training_fn(NUM_EPOCHS, display_every = 16):
  '''
  Trains the global model on batches from the global TranslationDataset, using Adam with an
  exponentially decaying learning rate(starts at 1e-3, decay rate 0.9, decay steps = display_every).
  NUM_EPOCHS: number of rounds
  display_every: number of optimizer steps per round
  After every round it prints the first two sentences, their predicted translation,
  the reference translation and the mean loss of the round.
  '''
  schedule = keras.optimizers.schedules.ExponentialDecay(1e-3, display_every, 0.9)
  optimizer = tf.keras.optimizers.Adam(learning_rate = schedule)
  for EPOCH in range(NUM_EPOCHS):
    total_loss = 0
    for _ in tqdm.tqdm(range(display_every)):
      # Each step consumes only the first batch the dataset yields
      batch = next(iter(TranslationDataset), None)
      if batch is None:
        continue
      english, french = batch
      with tf.GradientTape() as tape:
        loss = model(english, y = french, training = True)
      weights = model.trainable_weights
      optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))
      total_loss += loss.numpy().item()
    # Visualize Trained Translations
    sample_english = TranslationDataset.english[0:2]
    print(f"English: {sample_english}")
    print(f"French_pred: {model(sample_english, training = False, print_states = True)}")
    print(f"French: {TranslationDataset.french[0:2]}")
    print(f"EPOCH: {EPOCH}, total_loss: {total_loss / display_every}")

In [91]:
def test_fn(english):
  '''
  Runs the global model on a batch of english sentences with training = False
  and returns what the model produces when no french targets are passed.
  '''
  translations = model(english, training = False)
  return translations

In [None]:
# Train on the first GPU when one is visible, otherwise fall back to the CPU instead of requesting a missing device
with tf.device("GPU:0" if tf.config.list_physical_devices("GPU") else "CPU:0"):
  training_fn(15, display_every = 32)

In [93]:
# Save the trained weights under ./model/model, the same path the load_weights cell reads from
model.save_weights("./model/model")