<a href="https://colab.research.google.com/github/SadiyaMayat/NLP/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [3]:
# Path to translation file
path_to_data = '/content/hin.txt'

# Read file
translation_file = open(path_to_data, "r", encoding='utf-8')
raw_data = translation_file.read()
translation_file.close()

# Parse data
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data]
pairs = pairs[60:3061]

In [4]:
# Displaying first 5 pairs for verification
print(pairs[:5])

[['Definitely!', 'निश्चित ही', 'CC-BY 2.0 (France) Attribution: tatoeba.org #30542 (CM) & #6178948 (fastrizwaan)'], ["Don't move.", 'हिलो मत।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #433502 (CK) & #588484 (minshirui)'], ['Fire burns.', 'आग जलाती है।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #23865 (CM) & #457112 (minshirui)'], ['Follow him.', 'उसका पीछा करो।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #433754 (CK) & #588487 (minshirui)'], ['I can swim.', 'मुझे तैरना आता है।', 'CC-BY 2.0 (France) Attribution: tatoeba.org #256178 (CK) & #3216259 (anubhav93)']]


In [5]:
def clean_sentence(sentence):
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation
    string_punctuation = string.punctuation + "।"
    clean_sentence = lower_case_sent.translate(str.maketrans('', '', string_punctuation))
    return clean_sentence

def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer(filters='')
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

# Clean sentences
hindi_sentences = [clean_sentence(pair[0]) for pair in pairs]
english_sentences = [clean_sentence(pair[1]) for pair in pairs]

# Add start and end tokens to English sentences
english_sentences = ['<start> ' + sentence + ' <end>' for sentence in english_sentences]


In [6]:
# Tokenize words
hin_text_tokenized, hin_text_tokenizer = tokenize(hindi_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

print('Maximum length Hindi sentence: {}'.format(len(max(hin_text_tokenized, key=len))))
print('Maximum length English sentence: {}'.format(len(max(eng_text_tokenized, key=len))))

# Check language length
hindi_vocab = len(hin_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Hindi vocabulary is of {} unique words".format(hindi_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

max_hindi_len = int(len(max(hin_text_tokenized, key=len)))
max_english_len = int(len(max(eng_text_tokenized, key=len)))

hin_pad_sentence = pad_sequences(hin_text_tokenized, max_hindi_len, padding="post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding="post")

Maximum length Hindi sentence: 22
Maximum length English sentence: 27
Hindi vocabulary is of 2446 unique words
English vocabulary is of 2947 unique words


In [7]:
# Define the model
latent_dim = 256
embedding_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_hindi_len,))
enc_emb = Embedding(hindi_vocab, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, dropout=0.2, recurrent_dropout=0.2)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_english_len,))
dec_emb_layer = Embedding(english_vocab, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(english_vocab, activation='softmax'))
output = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], output)

# Compile the model
model.compile(optimizer=Adam(1e-3), loss=sparse_categorical_crossentropy, metrics=['accuracy'])
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 22)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 27)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 22, 256)              626176    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 27, 256)              754432    ['input_2[0][0]']             
                                                                                              

In [8]:
# Preparing the data for teacher forcing
decoder_input_data = np.zeros_like(eng_pad_sentence)
decoder_input_data[:, 1:] = eng_pad_sentence[:, :-1]
decoder_input_data[:, 0] = eng_text_tokenizer.word_index['<start>']

# Model training
model_results = model.fit(
    [hin_pad_sentence, decoder_input_data],
    np.expand_dims(eng_pad_sentence, -1),
    batch_size=64,
    epochs=100,
    validation_split=0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [9]:
def logits_to_sentence(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1) if prediction != 0])

index = 14
print("The English sentence is: {}".format(english_sentences[index]))
print("The Hindi sentence is: {}".format(hindi_sentences[index]))
print('The predicted sentence is :')
prediction = model.predict([hin_pad_sentence[index:index+1], decoder_input_data[index:index+1]])
print(logits_to_sentence(prediction[0], eng_text_tokenizer))


The English sentence is: <start> उसे अंदर भेजो <end>
The Hindi sentence is: let him in
The predicted sentence is :
<start> उसे अंदर भेजो <end>
