In [None]:
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, callbacks
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/gdrive/')
%cd /content/gdrive/My\ Drive/Seq2Seq\-DD\-RNN


sys.path.append('/content/gdrive/My Drive/Seq2Seq-DD-RNN/attention.py')
from attention import LuongAttention


In [None]:
import time

In [None]:
def char_index_map():
  """Build the character <-> index lookup tables used by the data generators.

  The code points 0x900..0x97F (128 characters, described in this notebook as
  the Sanskrit character range) are numbered 1..128 in code-point order.
  Three extra symbols follow: "+" -> 129, "S" -> 130, "E" -> 131.
  Index 0 is never assigned by this map.

  Returns:
    (char_to_index, index_to_char): two dicts that are inverses of each other.
  """
  first_cp = int("0x900", 16)
  last_cp = int("0x97F", 16)

  # characters of the block, numbered from 1 in code-point order
  char_to_index = {
      chr(cp): pos
      for pos, cp in enumerate(range(first_cp, last_cp + 1), start=1)
  }

  # the three special symbols take the next free indices (129, 130, 131)
  next_free = len(char_to_index) + 1
  for offset, symbol in enumerate(("+", "S", "E")):
    char_to_index[symbol] = next_free + offset

  # reverse table: 1..128 map back onto consecutive code points
  index_to_char = {pos: chr(first_cp + pos - 1) for pos in range(1, 129)}
  index_to_char.update({129: "+", 130: "S", 131: "E"})

  return char_to_index, index_to_char



def seq_length(text):
    """Return the length of the longest string stored in column 0 of `text`.

    `text` is indexed as text[i][0] for i in range(text.shape[0]); the
    result is -1 when there are no rows.
    """
    row_count = text.shape[0]
    return max((len(text[row][0]) for row in range(row_count)), default=-1)

In [None]:
class Decoder(Model):
  """One-time-step LSTM decoder that combines its output with an attention context.

  Args:
    out_dim: size of the softmax output layer.
    attention: callable attention layer; invoked as
      `attention(hidden_state, encoder_outputs)` and expected to return
      `(context_vector, attention_weights)`.
    embedded_layer: embedding layer applied to the incoming token ids.
      Must be provided before the decoder is called.
    hidden_units: width of the LSTM and of the tanh projection layer.
    dropout: input dropout rate of the LSTM. The original code ignored this
      argument and always used 0.3; it is now honoured (default unchanged).

  NOTE(review): this class overrides `__call__` instead of Keras' `call`, so
  `Model.__call__` is bypassed. Kept as-is to avoid changing how the training
  step invokes the model.
  """

  def __init__(self, out_dim,  
               attention, embedded_layer=None, 
               hidden_units= 512, dropout=0.3):
    # call base constructor from child
    super(Decoder,self).__init__()

    self.out_dim =  out_dim
    self.embedded_layer = embedded_layer # share embedded layer.
    self.latent_dims = hidden_units  
    self.attention = attention  # attention layer.

    self.lstm = layers.LSTM(hidden_units,
                            return_sequences=True,
                            return_state=True,
                            dropout=dropout)
    
    self.wa = layers.Dense(hidden_units, activation="tanh")
    self.fc = layers.Dense(out_dim, activation = "softmax")

  def __call__(self,x, states, encoder_outputs):
    """Run one decoding step.

    Args:
      x: token ids for a single time step (the training step passes
        `decoder_in[:, i]`, i.e. one id per batch row).
      states: `[hidden_state, cell_state]` used as the LSTM initial state.
      encoder_outputs: encoder output sequence handed to the attention layer.

    Returns:
      (decoder_out, hidden_state, cell_state, attention_weights) where
      decoder_out is the softmax output with a length-1 axis inserted at
      position 1.
    """
    # embed the ids, then add a length-1 time axis for the LSTM
    x = self.embedded_layer(x)
    x = tf.expand_dims(x,1)
    lstm_out,hidden_state, cell_state = self.lstm(x, initial_state = states)

    # apply attention using the new hidden state as the query
    context_vector, attention_weights = self.attention(hidden_state, encoder_outputs) 
    # both tensors are squeezed on axis 1 below, so context_vector must have
    # size 1 on that axis (presumably (batch_size, 1, hidden_units) -- TODO
    # confirm against LuongAttention)

    # concatenate context and LSTM output along the feature axis
    lstm_out = tf.concat([tf.squeeze(context_vector, 1), tf.squeeze(lstm_out, 1)], 1)

    # project the concatenation back to hidden_units with tanh
    lstm_out = self.wa(lstm_out)

    # softmax over out_dim classes, with the time axis restored
    decoder_out = tf.expand_dims(self.fc(lstm_out),1)

    return decoder_out, hidden_state, cell_state, attention_weights
    

In [None]:
class Encoder(Model):
  """Two stacked bidirectional LSTM layers on top of a shared embedding.

  Layer 1 uses the default Bidirectional merge; layer 2 sums the forward and
  backward sequences (merge_mode="sum") and also returns the final states of
  both directions, which are summed into a single [hidden, cell] pair.

  Args:
    embedded_layer: embedding layer applied to the input ids.
    hidden_units: width of every LSTM.
    dropout: input dropout rate of every LSTM.
  """

  def __init__(self, embedded_layer, 
               hidden_units= 512, dropout=0.3):
    super().__init__()
    self.embedded_layer = embedded_layer
    self.hidden_units = hidden_units

    # options shared by all four LSTMs
    sequence_opts = dict(return_sequences=True, dropout=dropout)

    # 1st layer: sequences only
    self.lstm1_forward = layers.LSTM(hidden_units, **sequence_opts)
    self.lstm1_backward = layers.LSTM(hidden_units,
                                      go_backwards=True,
                                      **sequence_opts)
    self.layer1 = layers.Bidirectional(self.lstm1_forward,
                                       backward_layer=self.lstm1_backward)

    # 2nd layer: sequences plus final states, directions summed
    stateful_opts = dict(sequence_opts, return_state=True)
    self.lstm2_forward = layers.LSTM(hidden_units, **stateful_opts)
    self.lstm2_backward = layers.LSTM(hidden_units,
                                      go_backwards=True,
                                      **stateful_opts)
    self.layer2 = layers.Bidirectional(self.lstm2_forward,
                                       backward_layer=self.lstm2_backward,
                                       merge_mode="sum")

  def __call__(self, x):
    """Encode `x` and return (encoder_output, [hidden_state, memory_state])."""
    embedded = self.embedded_layer(x)
    first_pass = self.layer1(embedded)

    (encoder_output,
     forward_h, forward_c,
     backward_h, backward_c) = self.layer2(first_pass)

    # collapse the two directions into one state pair by summation
    summed_states = [forward_h + backward_h, forward_c + backward_c]
    return encoder_output, summed_states

  def init_states(self, batch_size):
    """Return a pair of all-zero tensors of shape [batch_size, hidden_units]."""
    state_shape = [batch_size, self.hidden_units]
    return (tf.zeros(state_shape), tf.zeros(state_shape))
  

In [None]:
def load_txt(path='/content/gdrive/My Drive/Seq2Seq-DD-RNN/sanskrit_sandhi_corpus-master'):
  """Load the corpus files from `path`.

  Reads three files from the directory:
    combined_inp.txt  -- encoder inputs
    combined_out.txt  -- target strings; each line is wrapped as "S" + line + "E"
    sandhi_pos.txt    -- one line per example, newline stripped

  Args:
    path: corpus directory. Defaults to the Drive location used originally,
      so existing `load_txt()` calls behave as before.

  Returns:
    (inp, out, sandhi_pos, examples): three object arrays reshaped to
    (-1, 1) and the number of lines in combined_out.txt.
  """
  # NOTE(review): np.loadtxt splits on whitespace and skips blank lines and
  # '#' comments, unlike the line-based reads below -- confirm that
  # combined_inp.txt holds exactly one token per line.
  inp = np.loadtxt(path+"/combined_inp.txt", dtype="object").reshape(-1,1)

  # The corpus is non-ASCII text, so do not depend on the platform's default
  # encoding when reading it.
  with open(path+"/combined_out.txt", encoding="utf-8") as f:
    out = ["S"+line.strip("\n")+"E" for line in f]

  # Strip every line (the original only stripped the first len(out) lines and
  # raised IndexError when this file was shorter).
  with open(path+"/sandhi_pos.txt", encoding="utf-8") as f:
    sandhi = [line.strip("\n") for line in f]

  examples = len(out)

  out = np.array(out, dtype="object").reshape(-1,1)
  sandhi_pos = np.array(sandhi, dtype ="object") .reshape(-1,1) 

  return inp,out,sandhi_pos, examples





In [None]:
def load_data_split_gen(input_data, out_data, 
                        enc_len, decd_len,
                        vocab_size, batch_size=64):
  """Endless batch generator for the split-sequence model.

  Args:
    input_data: encoder strings, indexed as input_data[i][0].
    out_data: decoder target strings, indexed as out_data[i][0].
    enc_len: number of columns of the encoder input matrix.
    decd_len: number of columns of the decoder input / output matrices.
    vocab_size: size of the one-hot axis of the decoder output.
    batch_size: examples per batch; a trailing partial batch is never emitted.

  Yields:
    ([encoder_inp, decoder_inp], decoder_out) where
      encoder_inp is (batch, enc_len) int ids, zero padded,
      decoder_inp is (batch, decd_len) int ids, zero padded,
      decoder_out is (batch, decd_len, vocab_size) one-hot targets, shifted
        one step ahead of decoder_inp (class = id - 1).

  Fixes over the original:
    * `dec_len` was an undefined name; the parameter `decd_len` is used.
    * the decoder length guard compared against `enc_len`.
    * the guards used `>=`, which dropped sequences that fit exactly.
  Sequences that are too long are still left as all-zero rows, not truncated.
  """
  examples = input_data.shape[0]
  steps = examples//batch_size #number of batches
  char_to_index, _ = char_index_map() 

  while True:

    for step in range(steps):
      index = list(range(step*batch_size, (step+1)*batch_size))

      # shuffle the order of the examples inside this batch
      np.random.shuffle(index)
      temp_inp = input_data[index]
      temp_out = out_data[index]
      temp_size = len(index)

      # encoder input: (number of examples, enc_len)
      encoder_inp = np.zeros((temp_size,enc_len),dtype=int)
      for row in range(temp_size):
        word = temp_inp[row][0]
        if len(word) > enc_len:
          continue  # does not fit: row stays all padding
        for col, ch in enumerate(word):
          encoder_inp[row,col] = char_to_index[ch]

      # decoder input: (number of examples, decd_len)
      decoder_inp = np.zeros((temp_size, decd_len),dtype=int)
      # decoder output: (number of examples, decd_len, vocab_size)
      decoder_out = np.zeros((temp_size, decd_len, vocab_size))

      for row in range(temp_size):
        target = temp_out[row][0]
        if len(target) > decd_len:
          continue  # does not fit: row stays all padding
        for col, ch in enumerate(target):
          token = char_to_index[ch]
          decoder_inp[row,col] = token

          # decoder output is one time step ahead of decoder input
          if col > 0:
            decoder_out[row,col-1,token-1] = 1

      yield [encoder_inp, decoder_inp], decoder_out

    # reshuffle the whole data set before the next pass
    permute = np.random.permutation(examples)
    input_data = input_data[permute]
    out_data =  out_data[permute]


In [None]:
def gen_sandhi_pos_data(input_data, out_data,
                        output, enc_len, 
                        decd_len, vocab_size, 
                        batch_size=64):
  """Endless batch generator for the sandhi-position model.

  Args:
    input_data: encoder strings, indexed as input_data[i][0].
    out_data: sandhi positions per example as a comma-separated string of
      integers ("" when there are none), indexed as out_data[i][0].
    output: split strings, indexed as output[i][0]; only their length is used.
    enc_len: number of columns of the encoder input matrix.
    decd_len: number of columns of the decoder input / output matrices.
    vocab_size: size of the one-hot axis of the decoder output.
    batch_size: examples per batch; a trailing partial batch is never emitted.

  Yields:
    (encoder_inp, decoder_inp, decoder_out).
    For an example whose split string has length n:
      decoder_inp[0] = 3 (start), decoder_inp[n] = 4 (end), the positions in
      between are 1, and decoder_inp[p+1] = 2 for every listed position p.
      decoder_out[p, 1] = 1 for every listed position p, decoder_out[n-1, 3] = 1.

  Fixes over the original:
    * the end-of-epoch reshuffle permuted the batch-sized local `out`
      instead of `output`, raising IndexError and leaving `output` out of
      step with the other two arrays.
    * the encoder length guard used `>=`, dropping inputs that fit exactly.

  NOTE(review): time steps that are neither a split nor the end keep an
  all-zero target row (class 0 is never set) -- confirm this is intended.
  """
  examples = input_data.shape[0]
  steps = examples//batch_size #number of batches
  char_to_index, _ = char_index_map() 

  while True:
    for step in range(steps):
      index = list(range(step*batch_size, (step+1)*batch_size))

      # shuffle the order of the examples inside this batch
      np.random.shuffle(index)
      temp_inp = input_data[index]
      temp_out = out_data[index]
      out = output[index]
      temp_size = len(index)

      # encoder input: (number of examples, enc_len)
      encoder_inp = np.zeros((temp_size,enc_len),dtype=int)
      for row in range(temp_size):
        word = temp_inp[row][0]
        if len(word) > enc_len:
          continue  # does not fit: row stays all padding
        for col, ch in enumerate(word):
          encoder_inp[row,col] = char_to_index[ch]

      # decoder input: (number of examples, decd_len)
      decoder_inp = np.zeros((temp_size, decd_len),dtype=int)
      # decoder output: (number of examples, decd_len, vocab_size)
      decoder_out = np.zeros((temp_size, decd_len, vocab_size),dtype=int)

      for row in range(temp_size):
        n = len(out[row][0]) # length of the split string
        decoder_inp[row, :n+1] = 1
        decoder_inp[row][0] = 3 # add start token
        decoder_inp[row][n] = 4 # add end token

        raw_positions = temp_out[row][0]
        if raw_positions == "":
          positions = []
        else:
          positions = list(map(int, raw_positions.split(",")))

        # NOTE(review): the original skipped all marking when the *count* of
        # positions reached enc_len; preserved here, but it looks like a
        # leftover from the encoder loop.
        if len(positions) < enc_len:
          for pos in positions:
            decoder_inp[row][pos+1] = 2
            # decoder output is one time step ahead of decoder input
            decoder_out[row,pos,1] = 1
        decoder_out[row,n-1,3] = 1

      yield encoder_inp, decoder_inp, decoder_out

    # reshuffle all three arrays with the same permutation so rows stay aligned
    permute = np.random.permutation(examples)
    input_data = input_data[permute]
    out_data =  out_data[permute]
    output = output[permute]


In [None]:
def gen_val_sandhi_pos_data(input_data, out_data,
                        output, enc_len, 
                        decd_len, vocab_size, 
                        batch_size=64):
  """Single-pass batch generator for validating the sandhi-position model.

  Args:
    input_data: encoder strings, indexed as input_data[i][0].
    out_data: sandhi positions per example as a comma-separated string of
      integers ("" when there are none), indexed as out_data[i][0].
    output: split strings, indexed as output[i][0]; only their length is used.
    enc_len: number of columns of the encoder input matrix.
    decd_len: number of columns of the decoder input / output matrices.
    vocab_size: size of the one-hot axis of the decoder output.
    batch_size: examples per batch; a trailing partial batch is never emitted.

  Yields:
    (encoder_inp, decoder_inp, decoder_out), once per full batch, then stops.
    For an example whose split string has length n:
      decoder_inp[0] = 3 (start), decoder_inp[n] = 4 (end), the positions in
      between are 1, and decoder_inp[p+1] = 2 for every listed position p.
      decoder_out[p, 1] = 1 for every listed position p, decoder_out[n-1, 3] = 1.

  Fixes over the original:
    * the statements after the loop tried to permute the batch-sized local
      `out` with a full-size permutation, so exhausting the generator raised
      IndexError (or UnboundLocalError with zero batches) instead of ending
      normally. That reshuffle had no observable effect in a single-pass
      generator and has been removed.
    * the encoder length guard used `>=`, dropping inputs that fit exactly.

  NOTE(review): time steps that are neither a split nor the end keep an
  all-zero target row (class 0 is never set) -- confirm this is intended.
  """
  examples = input_data.shape[0]
  steps = examples//batch_size #number of batches
  char_to_index, _ = char_index_map() 

  for step in range(steps):
    index = list(range(step*batch_size, (step+1)*batch_size))

    # shuffle the order of the examples inside this batch
    np.random.shuffle(index)
    temp_inp = input_data[index]
    temp_out = out_data[index]
    out = output[index]
    temp_size = len(index)

    # encoder input: (number of examples, enc_len)
    encoder_inp = np.zeros((temp_size,enc_len),dtype=int)
    for row in range(temp_size):
      word = temp_inp[row][0]
      if len(word) > enc_len:
        continue  # does not fit: row stays all padding
      for col, ch in enumerate(word):
        encoder_inp[row,col] = char_to_index[ch]

    # decoder input: (number of examples, decd_len)
    decoder_inp = np.zeros((temp_size, decd_len),dtype=int)
    # decoder output: (number of examples, decd_len, vocab_size)
    decoder_out = np.zeros((temp_size, decd_len, vocab_size),dtype=int)

    for row in range(temp_size):
      n = len(out[row][0]) # length of the split string
      decoder_inp[row, :n+1] = 1
      decoder_inp[row][0] = 3 # add start token
      decoder_inp[row][n] = 4 # add end token

      raw_positions = temp_out[row][0]
      if raw_positions == "":
        positions = []
      else:
        positions = list(map(int, raw_positions.split(",")))

      # NOTE(review): the original skipped all marking when the *count* of
      # positions reached enc_len; preserved here, but it looks like a
      # leftover from the encoder loop.
      if len(positions) < enc_len:
        for pos in positions:
          decoder_inp[row][pos+1] = 2
          # decoder output is one time step ahead of decoder input
          decoder_out[row,pos,1] = 1
      decoder_out[row,n-1,3] = 1

    yield encoder_inp, decoder_inp, decoder_out


In [None]:
def gen_valid_data_split(input_data, out_data, 
                               enc_len, decd_len,
                               vocab_size, batch_size=64):
  """Single-pass batch generator for validating the split-sequence model.

  Args:
    input_data: encoder strings, indexed as input_data[i][0].
    out_data: decoder target strings, indexed as out_data[i][0].
    enc_len: number of columns of the encoder input matrix.
    decd_len: number of columns of the decoder input / output matrices.
    vocab_size: size of the one-hot axis of the decoder output.
    batch_size: examples per batch; a trailing partial batch is never emitted.

  Yields:
    ([encoder_inp, decoder_inp], decoder_out), once per full batch, then
    stops. decoder_out is one-hot (class = id - 1) and shifted one step ahead
    of decoder_inp.

  Fixes over the original:
    * `dec_len` was an undefined name; the parameter `decd_len` is used.
    * the decoder length guard compared against `enc_len`.
    * the guards used `>=`, which dropped sequences that fit exactly.
    * the reshuffle after the loop only rebound local names right before the
      generator finished, so it had no effect; it has been removed.
  Sequences that are too long are still left as all-zero rows, not truncated.
  """
  examples = input_data.shape[0]
  steps = examples//batch_size #number of batches
  char_to_index, _ = char_index_map() 

  for step in range(steps):
    index = list(range(step*batch_size, (step+1)*batch_size))

    # shuffle the order of the examples inside this batch
    np.random.shuffle(index)
    temp_inp = input_data[index]
    temp_out = out_data[index]
    temp_size = len(index)

    # encoder input: (number of examples, enc_len)
    encoder_inp = np.zeros((temp_size,enc_len),dtype=int)
    for row in range(temp_size):
      word = temp_inp[row][0]
      if len(word) > enc_len:
        continue  # does not fit: row stays all padding
      for col, ch in enumerate(word):
        encoder_inp[row,col] = char_to_index[ch]

    # decoder input: (number of examples, decd_len)
    decoder_inp = np.zeros((temp_size, decd_len),dtype=int)
    # decoder output: (number of examples, decd_len, vocab_size)
    decoder_out = np.zeros((temp_size, decd_len, vocab_size))

    for row in range(temp_size):
      target = temp_out[row][0]
      if len(target) > decd_len:
        continue  # does not fit: row stays all padding
      for col, ch in enumerate(target):
        token = char_to_index[ch]
        decoder_inp[row,col] = token

        # decoder output is one time step ahead of decoder input
        if col > 0:
          decoder_out[row,col-1,token-1] = 1

    yield [encoder_inp, decoder_inp], decoder_out


In [None]:
@tf.function
def train_step(source_word, decoder_in,
               decoder_out, 
               ecnoder,decoder, entropy):
  """Run one teacher-forced training step and apply the gradients.

  Args:
    source_word: encoder input batch.
    decoder_in: decoder input ids, indexed as decoder_in[:, t] per time step.
    decoder_out: decoder targets, indexed as decoder_out[:, t, :].
    ecnoder: the encoder model (parameter name kept, misspelling included,
      so existing callers keep working).
    decoder: the decoder model.
    entropy: loss callable invoked as entropy(y_true, y_pred).

  Returns:
    The loss summed over all time steps, divided by source_word.shape[0].

  Note: gradients are applied with the module-level `optimizer`; it is not a
  parameter of this function.
  """
  # The original body referred to `encoder`, which silently resolved to a
  # global and ignored the argument that was passed in. Bind it explicitly.
  encoder = ecnoder
  loss = 0
  
  with tf.GradientTape() as tape:
    en_outputs, enc_states =  encoder(source_word)
    hidden_state, cell_state = enc_states

    # iterate through each time step, feeding the ground-truth token
    for i in range(decoder_in.shape[1]):
      
      target_seq_in = decoder_in[:,i]
      
      states = [hidden_state, cell_state]
      target_out, hidden_state, cell_state, attention_weights =  decoder(target_seq_in,
                                                                         states, 
                                                                         en_outputs)
      
      #  accumulate loss for each time step for the entire batch
      decoder_outs = tf.expand_dims(decoder_out[:,i,:],1)
      loss += entropy(decoder_outs, target_out)

  # Differentiate outside the tape context so the gradient computation itself
  # is not recorded.
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss/source_word.shape[0]


In [None]:
@tf.function
def test_step(source_word, decoder_in,
               decoder_out,
               ecnoder,decoder, entropy):
  """Compute the teacher-forced loss for one batch without updating weights.

  Args:
    source_word: encoder input batch.
    decoder_in: decoder input ids, indexed as decoder_in[:, t] per time step.
    decoder_out: decoder targets, indexed as decoder_out[:, t, :].
    ecnoder: the encoder model (parameter name kept, misspelling included,
      so existing callers keep working).
    decoder: the decoder model.
    entropy: loss callable invoked as entropy(y_true, y_pred).

  Returns:
    The loss summed over all time steps, divided by source_word.shape[0].
  """
  # The original body referred to `encoder`, which silently resolved to a
  # global and ignored the argument that was passed in. Bind it explicitly.
  encoder = ecnoder
  loss = 0

  en_outputs, enc_states =  encoder(source_word)
  hidden_state, cell_state = enc_states

  # iterate through each time step
  for i in range(decoder_in.shape[1]):

    target_seq_in = decoder_in[:,i]

    # The original called decoder(x, states, ...) with two undefined names;
    # feed the current token and the running LSTM state, as train_step does.
    states = [hidden_state, cell_state]
    target_out, hidden_state, cell_state, attention_weights =  decoder(target_seq_in,
                                                                       states,
                                                                       en_outputs)

    decoder_outs = tf.expand_dims(decoder_out[:,i,:],1)
    #  accumulate loss for each time step for the entire batch
    loss += entropy(decoder_outs, target_out)

  return loss/source_word.shape[0]



In [None]:
def train(train_data_gen, valid_data_gen, 
          encoder, decoder,            
          entropy, optimizer,
          steps_per_epoch, steps,
          EPOCHS=15, BATCH_SIZE=64,
          decay_factor = 0.5):
  """Custom training loop with validation-driven learning-rate decay.

  Args:
    train_data_gen: iterator yielding (encoder_in, decoder_in, decoder_out).
      It may be endless; exactly `steps_per_epoch` batches are drawn per epoch.
    valid_data_gen: iterator yielding the same triple, or a zero-argument
      callable returning a fresh iterator. A single-pass iterator is used up
      after its first pass, so pass a callable to validate in every epoch.
    encoder, decoder: models forwarded to train_step / test_step.
    entropy: loss callable forwarded to train_step / test_step.
    optimizer: optimizer whose learning rate is read and decayed here.
      NOTE(review): train_step applies gradients with the module-level
      `optimizer`, so the decay only takes effect if this is the same object.
    steps_per_epoch: number of training batches per epoch.
    steps: maximum number of validation batches per epoch.
    EPOCHS: number of epochs.
    BATCH_SIZE: unused; kept for interface compatibility.
    decay_factor: when the latest validation loss is not lower than the one
      before it, the learning rate is multiplied by
      1 / (1 + epoch * decay_factor).

  Fixes over the original:
    * looping directly over an endless training generator never finished an
      epoch; batches are now capped at `steps_per_epoch` / `steps`.
    * the learning rate was written through `optimizer.lr` but read through
      `optimizer.learning_rate`; both now use `learning_rate`.
    * "%d" / "%.0f" truncated the printed losses and learning rate.
    * averages are taken over the batches actually processed.
  """
  val_loss_epoch = []

  for epoch in range(EPOCHS):
    
    # access the current learning rate using the optimizer
    lr = float(keras.backend.get_value(optimizer.learning_rate))

    if len(val_loss_epoch) < 2:
      print("Not enought vaues to compare")
    elif val_loss_epoch[-2] <= val_loss_epoch[-1]:
      # validation loss did not improve: decay the learning rate
      lr = lr *(1/(1+(epoch*decay_factor)))
      keras.backend.set_value(optimizer.learning_rate, lr)
    else:
      print("Validation loss improved")
    
    start = time.time()
    print("\n Start of epochs: %d\n Learning Rate %g"%(epoch,lr))
    
    # zip() against a range stops after the requested number of batches
    # without pulling an extra one from the generator
    loss_value = 0.0
    train_batches = 0
    for i, (in_data, decoder_in, decoder_out) in zip(range(steps_per_epoch),
                                                     train_data_gen):
      print(f"Batch {i+1}")
      loss = train_step(in_data, decoder_in, 
                        decoder_out, 
                        encoder,decoder, entropy)
      loss_value += float(loss)
      train_batches += 1
      print(f"Batch  {i+1} loss: {loss}")
    
    avg_train_loss = loss_value/train_batches if train_batches else float("nan")
    print("Avg. train loss: %.4f"%avg_train_loss)

    val_batches = valid_data_gen() if callable(valid_data_gen) else valid_data_gen
    val_loss = 0.0
    val_seen = 0
    for _, (in_data, decoder_in, decoder_out) in zip(range(steps), val_batches):
      val_loss += float(test_step(in_data, decoder_in,
                                  decoder_out,
                                  encoder, decoder, entropy))
      val_seen += 1
    
    if val_seen:
      val_loss_epoch.append(val_loss/val_seen)
      print("Avg. valid loss: %.4f"%val_loss_epoch[-1])
    else:
      # nothing to average; leave the loss history untouched
      print("No validation batches available for this epoch")

    end = time.time()
    print(f"Time taken: {end-start}")



In [None]:
def train_valid_split(inp, out, sandhi, examples, train_size=0.8):
    """Shuffle the three aligned arrays and split them into train / validation.

    Args:
      inp: encoder inputs, one row per example.
      out: split target strings, one row per example.
      sandhi: sandhi-position strings, one row per example.
      examples: number of rows in each array.
      train_size: fraction of rows placed in the training split. The original
        ignored this argument and always used 0.8; the default is unchanged.

    Returns:
      [(inp_train, sandhi_train, out_train), (inp_valid, sandhi_valid, out_valid)]

    Fix over the original: out_train / out_valid were sliced from `sandhi`,
    so they duplicated the sandhi arrays; they now come from `out`.

    Side effect: reseeds NumPy's global RNG with 42 (twice), as before.
    """
    # get indices to split the data into train and validation.
    np.random.seed(42)
    n_train = int(train_size*examples)
    train_indices = set(np.random.choice(range(examples), n_train, replace = False).flatten().tolist())
    validation_indices = list(set(range(examples)) - train_indices)
    train_indices = list(train_indices)

    # shuffle the entire data with one permutation so rows stay aligned
    np.random.seed(42)
    indices = np.random.permutation(examples)
    inp = inp[indices]
    out = out[indices]
    sandhi = sandhi[indices]

    inp_train = inp[train_indices]
    inp_valid = inp[validation_indices]

    sandhi_train = sandhi[train_indices]
    sandhi_valid = sandhi[validation_indices]

    out_train = out[train_indices]
    out_valid = out[validation_indices]

    return [(inp_train, sandhi_train, out_train), (inp_valid, sandhi_valid, out_valid)]




In [None]:
# Load the corpus: encoder inputs, "S"/"E"-wrapped targets, sandhi position
# strings, and the number of examples.
inp, out, sandhi, examples = load_txt()

# Fixed widths of the encoder / decoder matrices built by the generators.
# NOTE(review): presumably derived from the longest sequences in the corpus
# (see seq_length) -- confirm, and note decd_len is smaller than enc_len.
enc_len = 1411
decd_len = 1400



In [None]:
# train_valid_split returns [(inp, sandhi, out) for train, (inp, sandhi, out) for validation]
data = train_valid_split(inp, out, sandhi, examples) # split the data into train and validation sets
inp_train, sandhi_train, out_train = data[0]
inp_valid, sandhi_valid, out_valid = data[1]

In [None]:
# Batch generators for the sandhi-position task. The literal 4 is the
# vocab_size argument, i.e. the size of the one-hot axis of the targets.
# The training generator loops forever; the validation generator makes a
# single pass over the validation data.
train_data_gen = gen_sandhi_pos_data(inp_train, sandhi_train,
                        out_train, enc_len, 
                        decd_len,4, 
                        batch_size=64)

valid_data_gen = gen_val_sandhi_pos_data(inp_valid, sandhi_valid,
                        out_valid, enc_len, 
                        decd_len, 4, 
                        batch_size=64)



In [None]:
# Training configuration. The constants declared here are now the ones that
# are actually passed on: the original re-typed 15 / 64 in the train() call
# and let Encoder / Decoder fall back to their own default width, so editing
# `hidden_units` would have changed only the attention layer.
EPOCHS = 15
BATCH_SIZE = 64
entropy = keras.losses.CategoricalCrossentropy()
optimizer = keras.optimizers.SGD(learning_rate=1.0)

# number of full batches in each split
steps_per_epoch = inp_train.shape[0]//BATCH_SIZE
steps = inp_valid.shape[0]//BATCH_SIZE

vocab_size = 131 #vocabulary size
out_dim = int(np.log2(vocab_size))  # used as the character embedding width
hidden_units = 512

# index 0 is the padding id (mask_zero=True), hence vocab_size+1 rows
embedded_layer = layers.Embedding(vocab_size+1, out_dim, mask_zero=True)
pos_embedded_layer = layers.Embedding(5, 4, mask_zero=True)

# create encoder
encoder = Encoder(embedded_layer, hidden_units=hidden_units) 

attention = LuongAttention(hidden_units)

# decoder over the 4 position classes
position_decoder = Decoder(4, attention, pos_embedded_layer,
                           hidden_units=hidden_units)

train(train_data_gen, valid_data_gen, 
          encoder, position_decoder,            
          entropy, optimizer,
          steps_per_epoch, steps,
          EPOCHS=EPOCHS, BATCH_SIZE=BATCH_SIZE,
          decay_factor = 0.5)


Not enought vaues to compare

 Start of epochs: 0
 Learning Rate 1
Batch 1
