# ENCODER DECODER MODEL FOR WORD LEVEL EMBEDDING

In [None]:
## LOADING THE REQUIRED LIBRARIES

import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from tqdm import tqdm 
import tensorflow as tf
from  tensorflow.keras.preprocessing.sequence import pad_sequences
from  sklearn.model_selection import train_test_split
from tqdm import tqdm

## Loading Dataset

In [None]:
## MOUNT GOOGLE DRIVE AT /content/drive; THE DATASET AND CHECKPOINT PATHS USED BELOW LIVE UNDER THIS MOUNT POINT
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## READ THE PRE-PROCESSED CSV AND GIVE ITS THREE COLUMNS THE NAMES USED BY THE REST OF THE NOTEBOOK

df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/cs2/processed_data.csv")
df.columns = ["enc_input", "dec_input", "y"]
## THE DECODER TARGET STARTS AS A COPY OF THE DECODER INPUT; THE SPECIAL TOKENS ARE ADDED LATER
df["dec_output"] = df["dec_input"]

## Adding start and end token

In [None]:
## TEACHER FORCING NEEDS TWO SPECIAL TOKENS:
##   <start> IS PREPENDED TO THE DECODER INPUT
##   <end>   IS APPENDED TO THE DECODER OUTPUT (THE TARGET)

df = df.assign(
    dec_input="<start> " + df["dec_input"],
    dec_output=df["dec_output"] + " <end>",
)

## Splitting And Sampling around 100k datapoints

---
##### THE FULL DATASET HAS ABOUT 500K DATAPOINTS, WHICH WOULD MAKE TRAINING VERY SLOW. THEREFORE I KEEP A RANDOM 20% OF THE ROWS WITH y == 1 TOGETHER WITH ALL THE ROWS WITH y == 2



In [None]:
## KEEP A REPRODUCIBLE 20% SAMPLE OF THE y == 1 ROWS AND EVERY y == 2 ROW
class_one_subset = df[df.y == 1].sample(frac=0.2, random_state=1)
class_two_rows = df[df.y == 2]
df_sampled = pd.concat((class_one_subset, class_two_rows))

In [None]:
## SPLIT THE SAMPLED DATA 80/20 INTO TRAIN AND VALIDATION, STRATIFIED ON THE y COLUMN

df_train, df_val = train_test_split(
    df_sampled,
    test_size=0.2,
    random_state=3,
    stratify=df_sampled["y"],
)

In [None]:
## THE DECODER-INPUT COLUMN NEVER CONTAINS "<end>", SO APPEND IT TO ONE ROW TO MAKE THE
## OUTPUT TOKENIZER (FITTED ON dec_input) LEARN AN INDEX FOR IT.
## THE ORIGINAL CHAINED FORM df_train["dec_input"].iloc[0] = ... CAN WRITE TO A TEMPORARY
## OBJECT (AND ALWAYS DOES UNDER PANDAS COPY-ON-WRITE), LEAVING df_train UNCHANGED.
## A SINGLE .iloc CALL ON THE FRAME ITSELF IS GUARANTEED TO UPDATE IT.

df_train = df_train.copy()  # own the data so the write cannot target a slice of df_sampled
dec_input_col = df_train.columns.get_loc("dec_input")
df_train.iloc[0, dec_input_col] = df_train.iloc[0, dec_input_col] + " <end>"

In [None]:
## SAMPLE 1000 ROWS AS TEST DATA FROM THE ROWS THAT ARE NOT PRESENT IN THE TRAIN AND VALIDATION DATA
## (I.E. ROWS OF df WHOSE INDEX IS NOT IN df_sampled).
## Index.isin REPLACES THE ORIGINAL PER-ELEMENT `x not in ndarray` SCAN, WHICH WAS QUADRATIC IN
## THE DATASET SIZE; THE MASK KEEPS THE ORIGINAL INDEX ORDER, SO THE SEEDED SAMPLE IS UNCHANGED.
np.random.seed(5)
unsampled_index = df.index[~df.index.isin(df_sampled.index)]
df_test = df.loc[np.random.choice(unsampled_index.values, 1000, replace=False)]

## Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
## WORD-LEVEL TOKENIZER FOR THE ENCODER INPUT, FITTED ON THE TRAINING SENTENCES ONLY
encoder_texts = df_train["enc_input"].map(str)
tk_inp = Tokenizer()
tk_inp.fit_on_texts(encoder_texts)

In [None]:
# WORD-LEVEL TOKENIZER FOR THE DECODER SIDE.
# THE FILTER LIST LEAVES OUT "<" AND ">" SO THAT "<start>" AND "<end>" SURVIVE AS SINGLE TOKENS
decoder_texts = df_train["dec_input"].map(str)
tk_out = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tk_out.fit_on_texts(decoder_texts)

## Dataset Loader

In [None]:
## THIS CLASS CONVERTS TEXT DATA TO INTEGER SEQUENCES AND RETURNS THE PADDED SEQUENCES
class Dataset :
    """Turn rows of text into padded integer sequences, one example at a time.

    Parameters
    ----------
    data : DataFrame-like with "enc_input", "dec_input" and "dec_output" columns.
        Every value is converted with ``str`` before use.
    tk_inp : fitted tokenizer used for the encoder text.
    tk_out : fitted tokenizer used for both decoder input and decoder output.
    max_len : length every sequence is padded (or truncated) to.
    """

    def __init__(self, data , tk_inp ,tk_out, max_len):
        self.encoder_inp = data["enc_input"].apply(str).values
        self.decoder_inp = data["dec_input"].apply(str).values
        self.decoder_out = data["dec_output"].apply(str).values
        self.tk_inp = tk_inp
        self.tk_out = tk_out
        self.max_len = max_len

    def _encode(self, tokenizer, text):
        """Tokenize one string and post-pad it with zeros to ``max_len``.

        Returns an array of shape (1, max_len). Truncation is left at the
        ``pad_sequences`` default, exactly as before.
        """
        ids = tokenizer.texts_to_sequences([text])
        return pad_sequences(ids, padding="post", maxlen=self.max_len)

    def __getitem__(self,i):
        """Return (encoder input, decoder input, decoder output) for example ``i``.

        The results are built from locals only: the previous version stored
        them on ``self``, which made concurrent ``__getitem__`` calls (e.g. a
        multi-worker loader) overwrite each other's data.
        """
        encoder_seq = self._encode(self.tk_inp, self.encoder_inp[i])
        decoder_inp_seq = self._encode(self.tk_out, self.decoder_inp[i])
        decoder_out_seq = self._encode(self.tk_out, self.decoder_out[i])
        return encoder_seq, decoder_inp_seq, decoder_out_seq

    def __len__(self):
        """Number of examples (length of the encoder-input array)."""
        return len(self.encoder_inp)

In [None]:
## THIS CLASS GROUPS THE DATASET INTO BATCHES OF THE REQUIRED SIZE

class Dataloader(tf.keras.utils.Sequence):
    """Batch iterator over a ``Dataset`` for ``model.fit``.

    Parameters
    ----------
    batch_size : number of examples per batch.
    dataset : object exposing ``encoder_inp`` (an array) and integer indexing
        that returns ``(enc, dec_in, dec_out)`` arrays whose first row is the
        padded sequence.

    Only full batches are produced: the trailing ``total % batch_size``
    examples are never served (see ``__len__``).
    """

    def __init__(self,batch_size,dataset):
        # Initialise the Keras base class first; newer Keras versions warn when
        # a Sequence/PyDataset subclass skips this call.
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.totl_points = self.dataset.encoder_inp.shape[0]

    def __getitem__(self,i):
        """Return ``([encoder_batch, decoder_input_batch], decoder_output_batch)`` for batch ``i``."""
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size

        # Each dataset item is a triple of (1, max_len) arrays; keep row 0 of each.
        rows = [self.dataset[j] for j in range(start, stop)]
        batch_enc = np.array([enc[0] for enc, _, _ in rows])
        batch_dec_input = np.array([dec_in[0] for _, dec_in, _ in rows])
        batch_dec_out = np.array([dec_out[0] for _, _, dec_out in rows])

        return [batch_enc , batch_dec_input],batch_dec_out

    def __len__(self):
        """Number of full batches (integer floor division, no float round-trip)."""
        return self.totl_points // self.batch_size

###### THE MAXIMUM SEQUENCE LENGTH IS SET TO 35, WHICH IS THE 99TH PERCENTILE OF THE WORD LENGTH DISTRIBUTION

In [None]:
# TRAINING DATA: SEQUENCES PADDED TO 35 TOKENS, SERVED IN BATCHES OF 512
train_dataset = Dataset(data=df_train, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
train_dataloader = Dataloader(512, train_dataset)

In [None]:
# VALIDATION DATA: SAME TOKENIZERS, SAME PADDING LENGTH AND BATCH SIZE AS TRAINING
val_dataset = Dataset(data=df_val, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
val_dataloader = Dataloader(512, val_dataset)

## Encoder Decoder Model

In [None]:
## LOADING THE TENSORFLOW LIBRARIES
from tensorflow.keras import layers
from tensorflow.keras import Model

In [None]:
## ENCODER CLASS
class Encoder(tf.keras.layers.Layer):
    """Bidirectional-LSTM encoder.

    Embeds a batch of token-id sequences and runs it through a bidirectional
    LSTM. ``call`` returns the per-step outputs plus the forward and backward
    final states concatenated into one hidden state and one cell state.
    """

    def __init__(self , vocab_size , embedding_dim , enc_units , input_len):
        super().__init__()

        # Hyper-parameters kept on the instance
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.input_len, self.enc_units = input_len, enc_units

        # Token embedding; id 0 is treated as padding and masked out
        self.embedding = layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_dim,
            mask_zero=True,
            input_length=self.input_len,
        )

        # One LSTM wrapped so it runs in both directions; states are returned
        # alongside the full output sequence
        recurrent = layers.LSTM(units=self.enc_units, return_state=True, return_sequences=True)
        self.lstm_bi = layers.Bidirectional(recurrent)

        # Separate concatenation layers for the hidden and the cell states
        self.concat1 = layers.Concatenate()
        self.concat2 = layers.Concatenate()

    def call(self,input):
        """Encode ``input`` (token ids).

        Returns ``(encoder_output, state_h, state_c)`` where ``state_h`` and
        ``state_c`` are the forward and backward final states joined together.
        """
        embedded = self.embedding(input)
        enc_output, fwd_h, fwd_c, bwd_h, bwd_c = self.lstm_bi(embedded)
        return enc_output, self.concat1([fwd_h, bwd_h]), self.concat2([fwd_c, bwd_c])

## DECODER CLASS
class Decoder(tf.keras.layers.Layer):
    """LSTM decoder: embeds token ids and runs them through a single LSTM
    started from a caller-supplied ``[hidden, cell]`` state."""

    def __init__(self,vocab_size , embedding_dim, dec_unit,input_len ):
        super().__init__()
        # Hyper-parameters only; the sub-layers are created in build()
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.input_len, self.dec_unit = input_len, dec_unit

    def build(self,input_shape):
        # Token embedding; id 0 is treated as padding and masked out
        self.embedding = layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_dim,
            mask_zero=True,
            input_length=self.input_len,
        )
        # LSTM that returns every step's output and its final states
        self.lstm = layers.LSTM(
            units=self.dec_unit,
            return_sequences=True,
            return_state=True,
        )

    def call(self,input, state):
        """Decode ``input`` (token ids) starting from ``state``.

        Returns ``(decoder_output, state_h, state_c)``.
        """
        embedded = self.embedding(input)
        return self.lstm(embedded, initial_state=state)
  

In [None]:
## DEFINING THE ARCHITECTURE

# VOCABULARY SIZES: ONE EXTRA SLOT BECAUSE ID 0 IS USED FOR PADDING
enc_vocab_size = len(tk_inp.word_index) + 1
dec_vocab_size = len(tk_out.word_index) + 1

# ENCODER INPUT: 35 TOKEN IDS PER EXAMPLE.
# NOTE: shape MUST BE A TUPLE; THE ORIGINAL (35) WAS JUST THE INTEGER 35
enc_input = layers.Input(shape=(35,))
# ENCODER
enc_output, state_h, state_c = Encoder(vocab_size=enc_vocab_size, embedding_dim=300,
                                       enc_units=256, input_len=35)(enc_input)
## THE ENCODER STATES (FORWARD + BACKWARD CONCATENATED, 2 * 256 = 512) SEED THE DECODER
enc_state = [state_h, state_c]

# DECODER INPUT: 35 TOKEN IDS PER EXAMPLE
dec_input = layers.Input(shape=(35,))
## DECODER: 512 UNITS SO ITS STATE SIZE MATCHES THE CONCATENATED ENCODER STATE
dec_output, _, _ = Decoder(vocab_size=dec_vocab_size, embedding_dim=300,
                           dec_unit=512, input_len=35)(dec_input, enc_state)

# DENSE LAYER: PER-STEP PROBABILITY DISTRIBUTION OVER THE OUTPUT VOCABULARY
dense = layers.Dense(dec_vocab_size, activation="softmax")(dec_output)

# MODEL DEFINING
model = Model(inputs=[enc_input, dec_input], outputs=dense)

In [None]:
## DEFINING THE CALLBACKS
## THE SAVE DIRECTORY IS SPELLED "ColabNotebooks" (NO SPACE) LIKE EVERY OTHER PATH IN THIS NOTEBOOK;
## THE ORIGINAL "Colab Notebooks" MEANT THE CHECKPOINT WAS WRITTEN TO A DIFFERENT FOLDER THAN THE ONE
## model.load_weights READS FROM.

callback = [
    # KEEP ONLY THE WEIGHTS OF THE EPOCH WITH THE LOWEST VALIDATION LOSS
    tf.keras.callbacks.ModelCheckpoint(
        "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/bidirectional_train_emb/besh.h5",
        monitor="val_loss",
        save_best_only=True,
        mode="min",
        save_weights_only=True,
    ),
    # STOP WHEN VALIDATION LOSS HAS NOT IMPROVED BY 0.0001 FOR 5 EPOCHS
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, min_delta=0.0001),
    tf.keras.callbacks.TensorBoard(
        "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/bidirectional_train_emb/logs/save",
        histogram_freq=1,
    ),
]
## TRAINING AND VALIDATION STEPS FOR ONE EPOCH
train_steps = len(train_dataloader)
val_steps = len(val_dataloader)

# COMPILING THE MODEL (TARGETS ARE INTEGER TOKEN IDS, HENCE THE SPARSE LOSS)
model.compile(optimizer="adam", loss='sparse_categorical_crossentropy')

In [None]:
## TRAIN FOR AT MOST 50 EPOCHS; THE CALLBACKS DECIDE CHECKPOINTING AND EARLY STOPPING
history = model.fit(
    train_dataloader,
    steps_per_epoch=train_steps,
    epochs=50,
    validation_data=val_dataloader,
    validation_steps=val_steps,
    callbacks=callback,
)

In [None]:
### RESTORE THE SAVED WEIGHTS FROM DISK INTO THE MODEL
best_weights_path = "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/bidirectional_train_emb/besh.h5"
model.load_weights(best_weights_path)

In [None]:
## THIS FUNCTION IS USED IN THE INFERENCE TIME TO PREDICT THE RESULTS GIVEN THE INPUT TEXT

def predict(ita_text,model,max_len=35):
    """Greedy-decode a translation for one input sentence.

    Relies on the notebook-level ``tk_inp`` / ``tk_out`` tokenizers and on the
    layer order of ``model``: ``model.layers[2]`` is called as the encoder,
    ``model.layers[3]`` as the decoder and ``model.layers[4]`` as the output
    projection.

    Parameters
    ----------
    ita_text : str, the source sentence.
    model : the trained encoder-decoder model.
    max_len : padding length for the input and maximum number of generated
        tokens. Defaults to 35, the length the model is built with and the
        one ``beam_search`` uses (the previous hard-coded 20 truncated longer
        inputs and capped the output early).

    Returns
    -------
    str : the predicted words joined by spaces, including the final "<end>"
    when the model produced it.
    """
    # text -> padded id sequence of shape (1, max_len)
    seq = tk_inp.texts_to_sequences([ita_text])
    seq = pad_sequences(seq, maxlen=max_len, padding="post")

    # encoder final states initialise the decoder
    _, state_h, state_c = model.layers[2](seq)
    input_state = [state_h, state_c]

    # Look the start token up instead of assuming it has index 1.
    start_id = tk_out.word_index["<start>"]
    current_vec = tf.constant([[start_id]], dtype=tf.int32)

    pred = []
    for _ in range(max_len):
        # one decoder step on the previously emitted token
        dec_output, dec_state_h, dec_state_c = model.layers[3](current_vec, input_state)
        probs = model.layers[4](dec_output)

        # greedy choice becomes the next decoder input
        current_vec = np.argmax(probs, axis=-1)
        input_state = [dec_state_h, dec_state_c]

        # Index 0 (padding) has no entry in index_word; stop cleanly instead
        # of raising KeyError.
        word = tk_out.index_word.get(int(current_vec[0][0]))
        if word is None:
            break
        pred.append(word)
        if word == "<end>":
            break

    return " ".join(pred)

In [None]:
## IMPORTING THE BLEU SCORE
import nltk.translate.bleu_score as bleu

In [None]:
# BLEU SCORE (GREEDY DECODING) ON 2000 RANDOM VALIDATION SENTENCES
BLEU = []
bleu_failures = 0
last_bleu_error = None
np.random.seed(1)
test_data = df_val.loc[np.random.choice(df_val.index,size = 2000,replace=False)]
for ind,i in tqdm(test_data.iterrows(),total=len(test_data),position=0):
    try:
        pred = predict(str(i.enc_input),model).split()
        act = [str(i.dec_output).split()]
        b = bleu.sentence_bleu(act,pred)
    except Exception as exc:
        # STILL BEST-EFFORT, BUT NO LONGER SILENT: A BARE `except:` ALSO SWALLOWED
        # KeyboardInterrupt AND HID HOW MANY SENTENCES WERE LEFT OUT OF THE AVERAGE
        bleu_failures += 1
        last_bleu_error = exc
        continue
    BLEU.append(b)
if bleu_failures:
    print("SKIPPED", bleu_failures, "SENTENCES; LAST ERROR:", repr(last_bleu_error))
print("BELU = ", np.mean(BLEU))

## PREDICTIONS ON TEST DATASET

In [None]:
## GREEDY PREDICTION FOR TEST SENTENCE NUMBER 19
sample_idx = 19
source_sentence = df_test.enc_input.values[sample_idx]
print("INPUT SENTENCE ===> ", source_sentence)
print("PREDICTED SENTENCE ===> ", predict(source_sentence, model))
print("ACTUAL SENTENCE ===> ", df_test.dec_output.values[sample_idx])

In [None]:
## GREEDY PREDICTION FOR TEST SENTENCE NUMBER 50
sample_idx = 50
source_sentence = df_test.enc_input.values[sample_idx]
print("INPUT SENTENCE ===> ", source_sentence)
print("PREDICTED SENTENCE ===> ", predict(source_sentence, model))
print("ACTUAL SENTENCE ===> ", df_test.dec_output.values[sample_idx])

## INFERENCE TIME

In [None]:
%%time
## TIME A SINGLE GREEDY-DECODING CALL ON ONE TEST SENTENCE (THE CELL MAGIC MUST STAY ON THE FIRST LINE)
predict(df_test.enc_input.values[50],model)

In [None]:
def beam_search(input,model,k,max_len=35):
    """Decode ``input`` with beam search and return the ``k`` best hypotheses.

    Relies on the notebook-level ``tk_inp`` / ``tk_out`` tokenizers and on the
    layer order of ``model``: ``model.layers[2]`` is called as the encoder,
    ``model.layers[3]`` as the decoder and ``model.layers[4]`` as the output
    projection.

    Parameters
    ----------
    input : str, the source sentence.
    model : the trained encoder-decoder model.
    k : beam width (number of hypotheses kept after every step).
    max_len : padding length for the input and maximum number of expansion
        steps (default 35, the value that used to be hard-coded).

    Returns
    -------
    list of ``(sentence, score)`` tuples, best first; ``score`` is the summed
    log-probability of the generated tokens.
    """
    seq = tk_inp.texts_to_sequences([input])
    seq = pad_sequences(seq, maxlen=max_len, padding="post")

    # encoder final states seed every decoder run
    _, enc_state_h, enc_state_c = model.layers[2](seq)
    input_state = [enc_state_h, enc_state_c]

    # Look the special tokens up instead of assuming "<start>" has index 1.
    start_id = tk_out.word_index["<start>"]
    end_id = tk_out.word_index["<end>"]

    # each beam is [token ids of shape (1, t), cumulative log-probability]
    k_beams = [[tf.constant([[start_id]], dtype=tf.int32), tf.constant(0.0)]]
    for _ in range(max_len):
        candidates = []
        expanded_any = False
        for sent_pred, prob in k_beams:
            if end_id in sent_pred.numpy():
                # finished hypothesis: carried over unchanged
                candidates.append([sent_pred, prob])
                continue

            expanded_any = True
            # Re-run the decoder over the whole prefix; its final hidden state
            # gives the distribution over the next token.
            _, dec_state_h, _ = model.layers[3](sent_pred, input_state)
            dense = model.layers[4](tf.expand_dims(dec_state_h, axis=0))
            log_probs = tf.math.log(dense)[0, 0]
            top_ids = tf.argsort(dense, direction='DESCENDING')[:, :, :k]
            for w in range(k):
                token = top_ids[:, :, w]
                candidates.append([
                    tf.concat((sent_pred, token), axis=-1),
                    prob + log_probs[token[0][0]],
                ])

        if not expanded_any:
            # every beam already ended; further iterations would change nothing
            break
        k_beams = sorted(candidates, key=lambda tup: float(tup[1]), reverse=True)[:k]

    all_sent = []
    for beam, score in k_beams:
        words = []
        # skip position 0 (the start token) and read to the end of the beam
        for token_id in beam.numpy()[0, 1:]:
            # Index 0 (padding) has no word; stop instead of raising KeyError.
            word = tk_out.index_word.get(int(token_id))
            if word is None:
                break
            words.append(word)
            if word == "<end>":
                break
        all_sent.append((" ".join(words), score.numpy()))
    return all_sent

In [None]:
# VALIDATION BLEU SCORE (BEAM SEARCH, WIDTH 3) ON 2000 RANDOM VALIDATION SENTENCES
BLEU_beam = []
beam_failures = 0
last_beam_error = None
np.random.seed(1)
test_data = df_val.loc[np.random.choice(df_val.index,size = 2000,replace=False)]
for ind,i in tqdm(test_data.iterrows(),total=len(test_data),position=0):

    try:
        # SCORE THE TOP-RANKED HYPOTHESIS ONLY
        pred = beam_search(str(i.enc_input),model,3)[0][0].split()
        act = [str(i.dec_output).split()]
        b = bleu.sentence_bleu(act,pred)
    except Exception as exc:
        # STILL BEST-EFFORT, BUT NO LONGER SILENT: A BARE `except:` ALSO SWALLOWED
        # KeyboardInterrupt AND HID HOW MANY SENTENCES WERE LEFT OUT OF THE AVERAGE
        beam_failures += 1
        last_beam_error = exc
        continue
    BLEU_beam.append(b)

if beam_failures:
    print("SKIPPED", beam_failures, "SENTENCES; LAST ERROR:", repr(last_beam_error))
print("BELU Score = ",np.mean(BLEU_beam))  

In [None]:
## BEAM-SEARCH (WIDTH 3) HYPOTHESES FOR TEST SENTENCE NUMBER 19
sample_idx = 19
source_sentence = df_test.enc_input.values[sample_idx]
separator = "=" * 50
print("INPUT SENTENCE ===> ", source_sentence)
print(separator)
print("ACTUAL SENTENCE ===> ", df_test.dec_output.values[sample_idx])
print(separator)
print("BEAM SEARCH OUTPUT ,  SCORE")
bm = beam_search(source_sentence, model, 3)
for hypothesis in bm:
    print(hypothesis)

In [None]:
## BEAM-SEARCH (WIDTH 3) HYPOTHESES FOR TEST SENTENCE NUMBER 50
sample_idx = 50
source_sentence = df_test.enc_input.values[sample_idx]
separator = "=" * 50
print("INPUT SENTENCE ===> ", source_sentence)
print(separator)
print("ACTUAL SENTENCE ===> ", df_test.dec_output.values[sample_idx])
print(separator)
print("BEAM SEARCH OUTPUT ,  SCORE")
bm = beam_search(source_sentence, model, 3)
for hypothesis in bm:
    print(hypothesis)

## Model Comparison

In [None]:
## ON-DISK SIZE (BYTES) OF EACH SAVED CHECKPOINT, IN THE ORDER USED BY THE COMPARISON TABLE
checkpoint_paths = [
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/char_trainable_embedding/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/word_trainable_embedding/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/word_w2v/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/word_ft/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/bidirectional_train_emb/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/multi_layered_word/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/attention_dot/besh.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/attention_gernal/best.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/attention_concat/best.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/monitonic_attention_dot/best.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/monotonic_attention_general/best.h5",
    "/content/drive/MyDrive/ColabNotebooks/cs2/model_save/monotonic_attention_concat/best.h5",
]
s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12 = map(os.path.getsize, checkpoint_paths)


In [None]:
## COMPARISON TABLE OF ALL TRAINED MODELS (ONE ROW PER MODEL, VALUES ENTERED BY HAND).
## STRAY WHITESPACE INSIDE CELLS (A TRAILING TAB AFTER "616,488", A LEADING SPACE BEFORE
## "28,464,826", A TRAILING SPACE AFTER "Pretrained Word2Vec") HAS BEEN REMOVED SO THE VALUES
## DISPLAY AND EXPORT CLEANLY.
df_comp = pd.DataFrame()
df_comp["Model"] = [
    "Encoder Decoder(Char Level)", "Encoder Decoder", "Encoder Decoder", "Encoder Decoder",
    "Bidirectional Encoder Decoder", "Multilayered Encoder Decoder",
    "Attention Dot Model", "Attention Gernal Model", "Attention Concat Model",
    "Monotonic Attention Dot Model", "Monotonic Attention Gernal Model", "Monotonic Attention Concat Model",
]
df_comp["Embedding"] = [
    "One Hot Encoding", "Trainable Embedding", "Pretrained Word2Vec", "Fast Text",
    "Trainable Embedding", "Trainable Embedding", "Trainable Embedding", "Trainable Embedding",
    "Trainable Embedding", "Trainable Embedding", "Trainable Embedding", "Trainable Embedding",
]
df_comp["BLEU Score(Greedy Search)"] = [0.3139, 0.4603, 0.4453, 0.4569, 0.4509, 0.4527,
                                        0.5055, 0.5545, 0.5388, 0.5469, 0.5514, 0.5348]
## "-" MARKS MODELS FOR WHICH NO BEAM-SEARCH SCORE WAS RECORDED
df_comp["BLEU Score(Beam Search)"] = ["-", "-", "-", "-", 0.4561, 0.4557,
                                      0.5411, 0.5324, 0.5671, "-", "-", "-"]
df_comp["Model Size(bytes)"] = [s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]
df_comp["Model Parameters"] = [
    "616,488", "26,363,578", "8,158,378", "8,158,378", "35,018,938", "28,464,826",
    "33,353,914", "33,419,706", "33,485,755", "33,353,914", "33,419,706", "33,485,755",
]
df_comp["Inference Time(ms)"] = [143, 92.3, 94.5, 93.7, 250, 311, 157, 164, 189, 164, 179, 176]
df_comp