In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd '/content/drive/My Drive/Colab/NMT'

/content/drive/My Drive/Colab/NMT


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense,LSTM,Bidirectional,GRU,Embedding,Input,Flatten,Dropout,TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.models import Model,load_model,Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pickle

In [4]:
cleaned_df = pd.read_csv('cleaned_df.csv')

In [5]:
cleaned_df.shape

(324562, 2)

In [6]:
# Collect the index labels of every row that must be discarded before training:
#   * rows where either sentence is missing (NaN cannot be tokenised), and
#   * rows where either sentence has more than MAX_SENTENCE_WORDS whitespace-separated words.
total_size = len(cleaned_df)
MAX_SENTENCE_WORDS = 20

# Detect missing sentences from the data instead of hard-coding the single row
# (70892) that happens to be NaN in this particular CSV.
has_missing = cleaned_df[['english', 'hindi']].isna().any(axis=1)

# Vectorised word counts; a missing sentence is counted as zero words.
eng_word_counts = cleaned_df['english'].fillna('').astype(str).str.split().str.len()
hin_word_counts = cleaned_df['hindi'].fillna('').astype(str).str.split().str.len()
is_too_long = (eng_word_counts > MAX_SENTENCE_WORDS) | (hin_word_counts > MAX_SENTENCE_WORDS)

# List of index labels, in the form expected by DataFrame.drop().
drop_rows = cleaned_df.index[(has_missing | is_too_long).values].tolist()

In [7]:
cleaned_df['english'][70892]

nan

In [8]:
cleaned_df.isna().sum()

english    1
hindi      0
dtype: int64

In [9]:
# Remove the flagged rows in place, then renumber the remaining rows 0..n-1
# (the old index is discarded rather than kept as a column).
cleaned_df.drop(index=drop_rows, inplace=True)
cleaned_df = cleaned_df.reset_index(drop=True)

In [10]:
cleaned_df.shape

(248833, 2)

In [11]:
df = cleaned_df.copy()

In [12]:
# Work on the first 50,000 sentence pairs only (df keeps the full filtered set).
cleaned_df = df.iloc[:50000]

In [13]:
cleaned_df.shape

(50000, 2)

In [14]:
# Build the source and target vocabularies as lists of unique whitespace-separated
# tokens in first-seen order.
#
# dict keys are unique and preserve insertion order, so dict.fromkeys() yields the
# same list as appending each unseen word one by one. It avoids the linear
# `word not in list` scan per token that made the original loop quadratic.
eng_vocab = list(dict.fromkeys(
    word
    for sentence in cleaned_df['english']
    for word in str(sentence).split()   # str() mirrors the original's guard on this column
))

hin_vocab = list(dict.fromkeys(
    word
    for sentence in cleaned_df['hindi']
    for word in sentence.split()
))

In [15]:
print(len(eng_vocab))
print(len(hin_vocab))

30246
34680


In [16]:
# Longest sentence (in whitespace-separated words) on each side of the corpus.
# These values fix the padded sequence lengths used by batch_generator.
# default=0 keeps the result at 0 for an empty frame.
max_eng_len = max(
    (len(str(sentence).split()) for sentence in cleaned_df['english']),
    default=0,
)
max_hin_len = max(
    (len(sentence.split()) for sentence in cleaned_df['hindi']),
    default=0,
)

In [17]:
print(max_eng_len)
print(max_hin_len)

20
20


In [18]:
# id -> word and word -> id lookup tables for both languages.
# Ids start at 1; id 0 is left unassigned here.
eng_tokens = dict(enumerate(eng_vocab, start=1))
rev_eng_tokens = {token: token_id for token_id, token in eng_tokens.items()}

hin_tokens = dict(enumerate(hin_vocab, start=1))
rev_hin_tokens = {token: token_id for token_id, token in hin_tokens.items()}

In [19]:
# Number of distinct words per language.
input_words = len(eng_vocab)
target_words = len(hin_vocab)

# Embedding / softmax sizes: one extra slot because word ids start at 1 and id 0 is unassigned.
encoder_tokens = input_words + 1
decoder_tokens = target_words + 1

In [20]:
print(encoder_tokens,decoder_tokens)

30247 34681


In [21]:
# Source (English) and target (Hindi) sentence columns.
english_lines = cleaned_df['english']
hindi_lines = cleaned_df['hindi']

# Fixed seed so the same rows land in the validation set on every run. Without it,
# restarting the kernel and reloading saved weights would score the model on a
# "validation" set that may contain rows it was trained on.
# No test_size is given, so scikit-learn's default hold-out fraction applies.
X_train,X_val,y_train,y_val = train_test_split(english_lines,hindi_lines,random_state=42)

In [22]:
def batch_generator(X = X_train,y = y_train,batch_size=32):
  """Endlessly yield teacher-forcing batches for the seq2seq model.

  Args:
    X: sliceable collection of source sentences (whitespace-separated words).
    y: sliceable collection of target sentences, aligned with X. Presumably each
       is wrapped in start/end marker tokens ('START_ ... _END') — confirm
       against the preprocessing that produced cleaned_df.
    batch_size: number of rows per yielded batch.

  Yields:
    ([encoder_input, decoder_input], decoder_output) where
      encoder_input  is (batch_size, max_eng_len) int32 word ids, 0-padded;
      decoder_input  is (batch_size, max_hin_len) int32 ids of every target word
                     except the last one;
      decoder_output is (batch_size, max_hin_len, decoder_tokens) int32 one-hot
                     of the target shifted one step left, so position t holds
                     the word that follows decoder_input[t].
    The final batch of a pass may contain fewer sentences; its unused rows stay
    all-zero.

  Raises:
    KeyError: if a word is not present in rev_eng_tokens / rev_hin_tokens.
  """
  while True:
    # Iterate over the data that was passed in. The original always read
    # X_train / y_train, so the "validation" generator yielded training data.
    for j in range(0,len(X),batch_size):
      # int32 is used everywhere instead of float.
      encoder_input = np.zeros((batch_size,max_eng_len),dtype='int32')
      decoder_input = np.zeros((batch_size,max_hin_len),dtype='int32')
      decoder_output = np.zeros((batch_size,max_hin_len,decoder_tokens),dtype='int32')
      for i,(input_text,tar_text) in enumerate(zip(X[j:j+batch_size],y[j:j+batch_size])):
        for t,word in enumerate(input_text.split()):
          encoder_input[i,t] = rev_eng_tokens[word]
        tar_words = tar_text.split()
        last_pos = len(tar_words)-1
        for t,word in enumerate(tar_words):
          if(t<last_pos):
            # Decoder input: every word except the final one.
            decoder_input[i,t] = rev_hin_tokens[word]
          if(t>0):
            # Decoder target: the same sequence offset by one timestep, so the
            # model learns to predict the NEXT word rather than copy its input.
            decoder_output[i,t-1,rev_hin_tokens[word]] = 1
      yield([encoder_input, decoder_input], decoder_output)

In [27]:
# Hyper-parameters shared by the encoder and decoder graphs.
hidden_dim = 1024      # units in every LSTM layer
emb_dim = 200          # embedding vector size
dropout_rate=0.2       # dropout applied to the embedded tokens

# Encoder: variable-length sequence of integer word ids.
encoder_inputs = Input(shape=(None,))
# mask_zero=True treats id 0 as padding; real word ids in rev_eng_tokens start at 1.
emb_vec = Embedding(input_dim=encoder_tokens,output_dim=emb_dim,mask_zero=True)(encoder_inputs)
enc_dropout = (TimeDistributed(Dropout(rate = dropout_rate)))(emb_vec)
# Three stacked LSTMs; the first two pass full sequences to the next layer.
# NOTE(review): activation='relu' replaces the LSTM default (tanh) — confirm this is
# intentional; it is not the configuration the decoder's second layer uses.
lstm1 = LSTM(hidden_dim,activation='relu',return_sequences=True)(enc_dropout)
lstm2 = LSTM(hidden_dim,activation='relu',return_sequences=True)(lstm1)
# Only the last layer returns its final hidden/cell state.
lstm3 = LSTM(hidden_dim,activation='relu',return_state=True)
encoder_outputs,state_h,state_c = lstm3(lstm2)
# The final states of the top encoder layer are what gets handed to the decoder.
encoder_states=[state_h,state_c]



In [28]:
# Decoder (training graph): consumes the target word ids and predicts a
# distribution over the target vocabulary at every timestep.
decoder_inputs = Input(shape=(None,))
# The layer objects are kept in variables (not called inline) so the inference
# graph can reuse the same trained layers.
dec_emb_vec = Embedding(input_dim=decoder_tokens,output_dim=emb_dim,mask_zero=True)
dec_emb = dec_emb_vec(decoder_inputs)
dec_dropout = (TimeDistributed(Dropout(rate=dropout_rate)))(dec_emb)
# First decoder LSTM starts from the encoder's final states.
# NOTE(review): this layer does not return its own state, so a step-by-step
# inference graph cannot read layer-1 state back from it — confirm inference handles this.
dec_lstm1 = LSTM(hidden_dim,activation='relu',return_sequences=True)
dec_lstm_1 = dec_lstm1(dec_dropout,initial_state=encoder_states)
# Second decoder LSTM is called without initial_state; it uses tanh whereas the other LSTMs use relu.
dec_lstm2 = LSTM(hidden_dim,activation='tanh',return_sequences=True,return_state=True)

# The returned states are discarded here; only the per-timestep outputs feed the softmax.
decoder_outputs,_,_ = dec_lstm2(dec_lstm_1)

# Per-timestep softmax over decoder_tokens classes.
decoder_dense = Dense(decoder_tokens,activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: [source ids, target input ids] -> next-word distributions.
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)



In [29]:
# Print the layer-by-layer architecture of the training model.
model.summary()

# Checkpoint callback: the file name embeds the epoch number and validation loss,
# and a file is written only when val_loss improves on the best value so far.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    6049400     input_3[0][0]                    
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, None, 200)    0           embedding_2[0][0]                
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
____________________________________________________________________________________________

In [30]:
# Training configuration.
train_samples = len(X_train)
val_samples = len(X_val)
batch_size = 64
epochs = 10

# Compile the model: Adam optimiser, categorical cross-entropy with light label
# smoothing (targets are one-hot), and per-token accuracy as the metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=1e-3), metrics=['acc'])

# Model.fit accepts Python generators directly; fit_generator is deprecated.
# The generators never terminate, so the step counts define the length of an epoch.
# callbacks_list was built earlier but was never handed to training, so no
# checkpoint was ever written — pass it here.
model.fit(batch_generator(X_train, y_train, batch_size = batch_size),
          steps_per_epoch = train_samples//batch_size,
          epochs=epochs,
          validation_data = batch_generator(X_val, y_val, batch_size = batch_size),
          validation_steps = val_samples//batch_size,
          callbacks = callbacks_list)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f95856073d0>

In [None]:
model.save_weights('nmt_weights.h5')

In [None]:
model.load_weights('nmt_weights.h5')

In [None]:
# ---------------------------------------------------------------------------
# Inference graphs (one decoding step at a time).
#
# The decoder has two stacked LSTMs, and BOTH layers' states must be carried from
# one step to the next. The original graph returned the states of the second LSTM
# and fed them back as the initial state of the first LSTM, while the second LSTM
# restarted from zeros at every step.
#
# decode_sequence() exchanges exactly two state arrays [h, c] with these models.
# To keep that interface, each array packs both layers side by side:
#     h = [h_layer1 | h_layer2],  c = [c_layer1 | c_layer2]   (width 2*hidden_dim)
# ---------------------------------------------------------------------------
_layer1_part = keras.layers.Lambda(lambda s: s[:, :hidden_dim])
_layer2_part = keras.layers.Lambda(lambda s: s[:, hidden_dim:])
_zeros_like = keras.layers.Lambda(lambda s: tf.zeros_like(s))

def _pack_states(layer1_state, layer2_state):
    """Concatenate the layer-1 and layer-2 state tensors along the feature axis."""
    return keras.layers.Concatenate(axis=-1)([layer1_state, layer2_state])

# Encoder: the layer-1 slot holds the encoder's final states; the layer-2 slot is
# zeros, matching training, where the second decoder LSTM gets no initial_state.
encoder_model = Model(
    encoder_inputs,
    [_pack_states(encoder_states[0], _zeros_like(encoder_states[0])),
     _pack_states(encoder_states[1], _zeros_like(encoder_states[1]))])

decoder_state_input_h = Input(shape=(2*hidden_dim,))
decoder_state_input_c = Input(shape=(2*hidden_dim,))
decoder_state_inputs = [decoder_state_input_h,decoder_state_input_c]

layer1_state_in = [_layer1_part(decoder_state_input_h),_layer1_part(decoder_state_input_c)]
layer2_state_in = [_layer2_part(decoder_state_input_h),_layer2_part(decoder_state_input_c)]

dec_emb_2 = dec_emb_vec(decoder_inputs)
dec_dropout2 = TimeDistributed(Dropout(rate=dropout_rate))(dec_emb_2)

# dec_lstm1 was built without return_state, so its state cannot be read back.
# Build a twin with the same configuration plus return_state=True and copy the
# trained weights into it. The copy is a snapshot: re-run this cell after loading
# or training new weights.
_step_cfg = dec_lstm1.get_config()
_step_cfg.update(return_state=True, name=dec_lstm1.name + '_step')
dec_lstm1_step = LSTM.from_config(_step_cfg)
dec_LSTM2,state_h1,state_c1 = dec_lstm1_step(dec_dropout2,initial_state=layer1_state_in)
dec_lstm1_step.set_weights(dec_lstm1.get_weights())

# Second layer resumes from its own previous state.
decoder_outputs2,layer2_h,layer2_c = dec_lstm2(dec_LSTM2,initial_state=layer2_state_in)

# Re-pack both layers' new states into the two-tensor interface.
state_h2 = _pack_states(state_h1, layer2_h)
state_c2 = _pack_states(state_c1, layer2_c)
decoder_states2 = [state_h2,state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs]+decoder_state_inputs,[decoder_outputs2]+decoder_states2)

In [None]:
def decode_sequence(input_seq, max_words=20):
    """Greedily translate one encoded source sentence.

    Args:
        input_seq: array of source word ids with batch size 1, as produced by
            batch_generator(..., batch_size=1).
        max_words: maximum number of target words to emit. The original compared
            the *character* length of the output against 20, which cut
            translations off after a handful of words.

    Returns:
        The decoded words joined into one string, each preceded by a space.
        The '_END' marker is never included.
    """
    # Encode the input as state vectors.
    states_value = list(encoder_model.predict(input_seq))
    # Target sequence of length 1, seeded with the start marker.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = rev_hin_tokens['START_']

    # Sampling loop (batch of size 1): emit at most max_words words.
    decoded_sentence = ''
    for _ in range(max_words):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # The decoder is fed a single timestep, so position 0 is the only (and
        # therefore also the last) position in the output.
        sampled_token_index = int(np.argmax(output_tokens[0, 0, :]))
        # Id 0 is the padding slot and has no entry in hin_tokens unless one was
        # added by hand; treat it as an empty word instead of raising KeyError.
        sampled_char = hin_tokens.get(sampled_token_index, '')

        # Exit condition: stop marker found.
        if(sampled_char == '_END'):
            break

        decoded_sentence += (' '+sampled_char)
        # Feed the sampled word back in as the next decoder input.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Carry the recurrent states forward.
        states_value = [h, c]

    return decoded_sentence

In [None]:
train_gen = batch_generator(X_train, y_train, batch_size = 1)
k=-1

In [None]:
val_gen = batch_generator(X_val, y_val, batch_size = 1)
k=-1

In [None]:
# Decode every training sentence and collect reference / predicted word lists
# (e.g. for a later BLEU computation).
# NOTE: this performs one or more predict() calls per sentence and is slow on the full set.
train_gen = batch_generator(X_train, y_train, batch_size = 1)
actual, predicted = list(), list()
for k in range(len(y_train)):
    (input_seq, actual_output), _ = next(train_gen)
    decoded_sentence = decode_sequence(input_seq)
    # Reference: [6:-4] removes a 6-character prefix and 4-character suffix,
    # presumably the 'START_' / '_END' markers — confirm against the preprocessing.
    actual.append(y_train.iloc[k][6:-4].split())
    # decode_sequence() stops BEFORE appending '_END', so there is no marker to
    # strip here; the original [:-4] slice removed four real characters.
    predicted.append(decoded_sentence.split())

KeyboardInterrupt: ignored

In [None]:
# hin_tokens is built with ids starting at 1, so id 0 has no entry. Map it to an
# empty string so that decode_sequence() does not raise KeyError when argmax picks index 0.
hin_tokens[0] = ""

In [None]:
(input_seq, actual_output), _ = next(train_gen)

In [None]:
decoded_sentence = decode_sequence(input_seq)

In [None]:
decoded_sentence

' हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं'

In [None]:
(input_seq, actual_output), _ = next(val_gen)

In [None]:
input_seq

array([[ 9852., 20422.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.]], dtype=float32)

In [None]:
actual_output

array([[1.0000e+00, 2.3187e+04, 2.3188e+04, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]],
      dtype=float32)

In [None]:
decoded_sentence = decode_sequence(input_seq)

In [None]:
decoded_sentence

' हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं हैं'