In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from plot_keras_history import plot_history
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM


In [3]:
with open('english-corpus.txt', 'r') as file:
    lines = file.readlines()


english = pd.DataFrame({'English-Sentences': lines})
english['English-Sentences'] = english['English-Sentences'].str.rstrip('\n')

In [4]:
english.head(10)

Unnamed: 0,English-Sentences
0,is zain your nephew
1,i wish youd trust me
2,did he touch you
3,its part of life
4,zain isnt ugly
5,above all be patient
6,i learned it from him
7,why am i doing this
8,i made a bad decision
9,zain wont care


In [5]:
with open('urdu-corpus.txt', 'r',encoding='utf-8') as file:
    lines = file.readlines()


urdu = pd.DataFrame({'Urdu-Sentences': lines})
urdu['Urdu-Sentences'] = urdu['Urdu-Sentences'].str.rstrip('\n')

In [6]:
urdu.head(5)

Unnamed: 0,Urdu-Sentences
0,زین تمہارا بھتیجا ہے۔
1,کاش تم مجھ پر بھروسہ کرتے
2,کیا اس نے آپ کو چھوا؟
3,اس کی زندگی کا حصہ
4,زین بدصورت نہیں ہے۔


In [7]:
data=pd.concat([english,urdu],axis=1)

In [8]:
data.head(20)

Unnamed: 0,English-Sentences,Urdu-Sentences
0,is zain your nephew,زین تمہارا بھتیجا ہے۔
1,i wish youd trust me,کاش تم مجھ پر بھروسہ کرتے
2,did he touch you,کیا اس نے آپ کو چھوا؟
3,its part of life,اس کی زندگی کا حصہ
4,zain isnt ugly,زین بدصورت نہیں ہے۔
5,above all be patient,سب سے بڑھ کر صبر کرو
6,i learned it from him,میں نے اسے اس سے سیکھا۔
7,why am i doing this,میں یہ کیوں کر رہا ہوں
8,i made a bad decision,میں نے ایک برا فیصلہ کیا
9,zain wont care,زین پرواہ نہیں کرے گا


**PERFORMING WORD COUNT AND KEEPING ONLY THOSE HAVING WORDS LESS THAN 8**

In [9]:
# Function for word count
def word_count (txt):
    return len(txt.split())

In [10]:
data['English-Words']=data['English-Sentences'].apply(lambda x: word_count(x))
data['Urdu-Words']=data['Urdu-Sentences'].apply(lambda x: word_count(x))

In [11]:
data.shape

(24525, 4)

In [12]:
filtered_data = data[(data['English-Words'] < 8) & (data['Urdu-Words'] < 8)]
filtered_data.head(5)

Unnamed: 0,English-Sentences,Urdu-Sentences,English-Words,Urdu-Words
0,is zain your nephew,زین تمہارا بھتیجا ہے۔,4,4
1,i wish youd trust me,کاش تم مجھ پر بھروسہ کرتے,5,6
2,did he touch you,کیا اس نے آپ کو چھوا؟,4,6
3,its part of life,اس کی زندگی کا حصہ,4,5
4,zain isnt ugly,زین بدصورت نہیں ہے۔,3,4


In [13]:
filtered_data.shape

(22969, 4)

**SPLITTING THE DATA INTO TRAIN AND TESTING**

In [14]:
train,test=train_test_split(filtered_data,test_size=0.2,random_state=42)

**ADDING END OF LINE IN MY TRAIN TEST DATA**

In [15]:
def adding_EOL(lines):
    text=[]
    for line in lines:
        text.append('<start> ' +line+ ' <end>')
    return text

ADDING EOL TO TRAIN DATA

In [16]:
english_train=adding_EOL(list(train['English-Sentences']))
urdu_train=adding_EOL(list(train['Urdu-Sentences']))

ADDING EOL TO TEST DATA

In [17]:
english_test=adding_EOL(list(test['English-Sentences']))
urdu_test=adding_EOL(list(test['Urdu-Sentences']))

**TOKENIZING THE TRAIN DATA**

In [18]:
def Tokenize_fn(X):
    tokenizer=Tokenizer(filters='',lower=False)
    tokenizer.fit_on_texts(X)
    vocab_size=len(tokenizer.word_index)+1
    return tokenizer.texts_to_sequences(X),vocab_size,tokenizer

In [19]:
def pad_fn(X,length=None):
    return pad_sequences(X, maxlen = length, padding = 'post')

In [20]:
# TOKENIZING ENGLISH TRAIN
english_train_tokenized,num_ip_tokens,english_train_tok=Tokenize_fn(english_train)
english_train=pad_fn(english_train_tokenized)
# TOKENIZING URDU TRAIN
urdu_train_tokenized,num_op_tokens,urdu_train_tok=Tokenize_fn(urdu_train)
urdu_train=pad_fn(urdu_train_tokenized)


In [21]:
max_len_ip=english_train.shape[1]
max_len_op=urdu_train.shape[1]

In [22]:
num_ip_tokens

4900

**TOKENIZING TESTING DATA**

In [23]:
# TOKENIZING ENGLISH TRAIN
english_test_tokenized,num_ip_tokens_test,english_test_tok=Tokenize_fn(english_test)
english_test=pad_fn(english_test_tokenized)
# TOKENIZING URDU TRAIN
urdu_test_tokenized,num_op_tokens_test,urdu_test_tok=Tokenize_fn(urdu_test)
urdu_test=pad_fn(urdu_test_tokenized)

In [24]:
num_ip_tokens_test

2484

**DOING SOME ENCODER DECODER DATA STUFF**

In [25]:
# For decoder input we don't need the last word as it is only for prediction (teacher forcing)
decoder_input_data=urdu_train[:,:-1]
# For decoder output we are one step ahead of input (teacher forcing)
decoder_target_data=urdu_train[:,1:]
# Encoder input data
encoder_input_data=english_train

print("THE DECODER INPUT DATA IS ",decoder_input_data.shape)
print("THE DECODER TARGET DATA IS ",decoder_target_data.shape)
print("THE ENCODER INPUT DATA IS ",encoder_input_data.shape)

THE DECODER INPUT DATA IS  (18375, 8)
THE DECODER TARGET DATA IS  (18375, 8)
THE ENCODER INPUT DATA IS  (18375, 9)


In [26]:
 # For testing Data
test_decoder_input_data=urdu_test[:,:-1]
test_decoder_target_data=urdu_test[:,1:]
test_encoder_input_data=english_test

print("THE DECODER INPUT DATA IS ",test_decoder_input_data.shape)
print("THE DECODER TARGET DATA IS ",test_decoder_target_data.shape)
print("THE ENCODER INPUT DATA IS ",test_encoder_input_data.shape)

THE DECODER INPUT DATA IS  (4594, 8)
THE DECODER TARGET DATA IS  (4594, 8)
THE ENCODER INPUT DATA IS  (4594, 9)


In [27]:
encoder_input_data[0]

array([   1,    3,   57,    7, 1927,    2,    0,    0,    0])

In [28]:
num_encoder_tokens=num_ip_tokens
num_decoder_tokens=num_op_tokens

print("THE VOCABULARY SIZE FOR ENGLISH IS ", num_encoder_tokens)
print("THE VOCABULARY SIZE FOR URDU IS ", num_decoder_tokens)

THE VOCABULARY SIZE FOR ENGLISH IS  4900
THE VOCABULARY SIZE FOR URDU IS  5077


**ADDIN MODEL HERE SEQUENCE TO SEQUENCE MODEL**

In [29]:
# This is for hidden units and for Embedding layer
latent_dim=300

#===========ENCODER-PART=================
encoder_inputs=tf.keras.layers.Input(shape=(encoder_input_data.shape[1]),name='ENCODER-INPUT-LAYER')
x=tf.keras.layers.Embedding(num_encoder_tokens,latent_dim, mask_zero=False ,name='EMBEDDING-LAYER-ENCODER')(encoder_inputs)
x=tf.keras.layers.BatchNormalization(name='ENCODER-NORMALIZATION')(x)
_,state_h=tf.keras.layers.GRU(latent_dim,return_state=True, name='ENCODER-GRU-LAYER')(x)
encoder_model=tf.keras.Model(inputs=encoder_inputs,outputs=state_h, name='ENCODER-MODEL')
seq2seq_encoder_out=encoder_model(encoder_inputs)
#==========DECODER-PART==================
decoder_inputs=tf.keras.layers.Input(shape=(None,),name='DECODER-INPUT-LAYER')
y=tf.keras.layers.Embedding(num_decoder_tokens,latent_dim,name='DECODER-EMBEDDING-LAYER')(decoder_inputs)
y=tf.keras.layers.BatchNormalization(name='DECODER-NORMALIZATION-1')(y)
decoder_gru=tf.keras.layers.GRU(latent_dim,return_state=True,name='DECODER-GRU',return_sequences=True)
decoder_gru_output,_=decoder_gru(y,initial_state=seq2seq_encoder_out)
x=tf.keras.layers.BatchNormalization(name='DECODER-NORMALIZATION-2')(decoder_gru_output)
decoder_dense=tf.keras.layers.Dense(num_decoder_tokens,activation='softmax',name='FINAL-OUTPUT-LAYER')
decoder_outputs=decoder_dense(x)
#============SEQUENCE TO SEQUENCE MODEL======
seq2seq_model=tf.keras.Model(inputs=[encoder_inputs,decoder_inputs],outputs=decoder_outputs)
seq2seq_model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001), loss='sparse_categorical_crossentropy')


In [30]:
seq2seq_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 DECODER-INPUT-LAYER (InputLaye  [(None, None)]      0           []                               
 r)                                                                                               
                                                                                                  
 DECODER-EMBEDDING-LAYER (Embed  (None, None, 300)   1523100     ['DECODER-INPUT-LAYER[0][0]']    
 ding)                                                                                            
                                                                                                  
 ENCODER-INPUT-LAYER (InputLaye  [(None, 9)]         0           []                               
 r)                                                                                           

In [55]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                               patience=3,  
                               restore_best_weights=True)

In [58]:
history=seq2seq_model.fit([encoder_input_data,decoder_input_data],decoder_target_data,batch_size=1200,epochs=70,
                          validation_split=0.1,callbacks=[early_stopping])

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70


In [31]:
#seq2seq_model.save_weights("SEQUENCE-TO-SEQUENCE-MODEL.h5")
seq2seq_model.load_weights("SEQUENCE-TO-SEQUENCE-MODEL.h5")

In [32]:
pred=seq2seq_model.predict([test_encoder_input_data,test_decoder_input_data])



In [33]:
pred.shape

(4594, 8, 5077)