# Sequence to Sequence Model for Language Translation (40pt)

In [None]:
# Mount Google Drive so the files referenced under /content/drive/ are reachable.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import numpy as np
import os

## Reading Sentence Translation Data

In [None]:
# Tab-separated parallel corpus stored on the mounted Drive.
data_file = "/content/drive/My Drive/lang/mar-eng/mar.txt"
# Alternative corpus; swap the two assignments to use it instead.
#data_file = "/content/drive/My Drive/lang/spa-eng/spa.txt"

# Decode explicitly as UTF-8 (the corpus contains non-ASCII text, so relying on
# the platform default encoding is fragile) and let the context manager close
# the handle even if reading fails.
with open(data_file, 'r', encoding='utf-8') as f:
  data_lines = f.readlines()

In [None]:
# Split every corpus line into its tab-separated columns:
#   column 0 -> language_y, column 1 -> language_x.
# Any further columns are ignored.
language_x, language_y = [], []
for line in data_lines:
  fields = line.split("\t")
  # A blank or truncated line has no second column; skip it instead of
  # crashing with IndexError so one bad row cannot abort the whole load.
  if len(fields) < 2:
    continue
  language_x.append(fields[1].strip())
  language_y.append(fields[0].strip())

In [None]:
# Spot-check that the two lists are aligned by showing the pair at index 170.
language_x[170],language_y[170]

('मी गरीब आहे.', "I'm poor.")

In [None]:
# Number of sentence pairs that were loaded.
len(language_x)

40188

## Tokenizing data

In [None]:
# Characters removed from the text before it is split into words.
# Written as a raw string so the backslash is literal: the previous plain
# literal contained `\]`, which is an invalid escape sequence in Python.
TOKEN_FILTERS = r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ '


def fit_tokenizer(texts):
  """Return a Keras Tokenizer fitted on `texts`.

  Out-of-vocabulary words map to the "<unk>" token. Nothing is registered for
  id 0: `fit_on_texts` rebuilds `word_index` / `index_word` from scratch, so
  entries written before fitting (as this cell used to do for "<pad>") are
  discarded. Id 0 therefore stays reserved for padding and is rendered as
  "<unk>" by `sequences_to_texts`, which is what the later
  `.replace("<unk>", "")` clean-up relies on.
  """
  tokenizer = Tokenizer(oov_token="<unk>", filters=TOKEN_FILTERS)
  tokenizer.fit_on_texts(texts)
  return tokenizer


# One tokenizer per language, each followed by the integer-encoded sentences.
tokenizer_X = fit_tokenizer(language_x)
sequence_x = tokenizer_X.texts_to_sequences(language_x)

tokenizer_y = fit_tokenizer(language_y)
sequence_y = tokenizer_y.texts_to_sequences(language_y)

In [None]:
# Integer encodings of the pair at index 170 in both languages.
sequence_x[170],sequence_y[170]

([3, 657, 2], [29, 598])

## Padding Data

In [None]:
# Right-pad ('post') every encoded sentence with zeros; each language is padded
# independently to the length of its own longest sentence.
sequence_x_padded, sequence_y_padded = (
    pad_sequences(encoded, padding='post')
    for encoded in (sequence_x, sequence_y)
)

In [None]:
# Inspect one padded language_x sequence to see the trailing zero padding.
sequence_x_padded[40000]

array([    8,  4119,    43,  5105,    91, 13517,    43,  4627,   264,
         934,    63,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [None]:
# The matching padded language_y sequence for the same row.
sequence_y_padded[40000]

array([ 36,  53, 655, 499,  19,   8, 323,  81, 588,  40,   8, 334,  81,
       759,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

## Splitting into train and test

In [None]:
# Hold out 5% of the sentence pairs for testing; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    sequence_x_padded,
    sequence_y_padded,
    test_size=0.05,
    random_state=12,
)

In [None]:
# Display the padded training inputs (one row of word ids per sentence).
X_train

array([[   3, 1038,  145, ...,    0,    0,    0],
       [   6,  505,   37, ...,    0,    0,    0],
       [  59,  782, 1970, ...,    0,    0,    0],
       ...,
       [  40, 6279, 9100, ...,    0,    0,    0],
       [   9,   62,  184, ...,    0,    0,    0],
       [3862, 2932,    2, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Display the padded training targets (one row of word ids per sentence).
y_train

array([[  2,  77,   6, ...,   0,   0,   0],
       [412,   2, 138, ...,   0,   0,   0],
       [112,  71,  41, ...,   0,   0,   0],
       ...,
       [ 14, 305,  43, ...,   0,   0,   0],
       [  4,  45,  56, ...,   0,   0,   0],
       [ 39,   7,   8, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Decode the first training pair back to text. Padding id 0 is rendered as
# "<unk>" by sequences_to_texts, so those tokens are stripped before printing.
for tok, row in ((tokenizer_X, X_train[0]), (tokenizer_y, y_train[0])):
  decoded = tok.sequences_to_texts([row])[0]
  print(decoded.replace("<unk>","").strip())

मी दुकानात गेलो
i went to the shop


## Creating LSTM model

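*   Encoder: an `Embedding` layer followed by an `LSTM` that compresses the source sentence into a single vector.
*   Decoder: `RepeatVector` feeds that vector to a second `LSTM` at every output step, and a softmax `Dense` layer picks one target word per step.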



In [None]:
# Vocabulary sizes: +1 because word ids start at 1 and id 0 is the padding value.
x_vocab_size = len(tokenizer_X.word_index) + 1
y_vocab_size = len(tokenizer_y.word_index) + 1
# Padded sequence lengths, i.e. the number of columns of the 2-D id arrays.
x_length = X_train.shape[1]
y_length = y_train.shape[1]

print("Vocab size of original language: ",x_vocab_size)
print("Vocab size of translated language: ",y_vocab_size)
print("Vector length of each sentence in original language: ",x_length)
# Label typo fixed ("sentence i translated" -> "sentence in translated").
print("Vector length of each sentence in translated language: ",y_length)

Vocab size of original language:  13786
Vocab size of translated language:  5857
Vector length of each sentence in original language:  35
Vector length of each sentence i translated language:  35


In [None]:
# Model 1: encoder-decoder that maps a padded source id sequence to a
# per-position probability distribution over the target vocabulary.
model = Sequential()
# Trainable 100-d word embeddings; mask_zero=True marks id 0 as padding so the
# encoder LSTM ignores those time steps.
model.add(Embedding(x_vocab_size, 100, input_length=x_length, mask_zero=True))
# Encoder: summarise the whole source sentence as a single 100-d vector.
model.add(LSTM(100))
# Present that summary vector to the decoder once per target position.
model.add(RepeatVector(y_length))
# Decoder: one hidden state per target position.
model.add(LSTM(100, return_sequences=True))
# Softmax over the target vocabulary at every position.
model.add(Dense(y_vocab_size, activation='softmax'))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
rms = RMSprop(learning_rate=0.001)
# Targets are integer word ids, hence the sparse form of cross-entropy.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
# Print the layer-by-layer output shapes and parameter counts of model 1.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 35, 100)           1378600   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 35, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 35, 100)           80400     
_________________________________________________________________
dense (Dense)                (None, 35, 5857)          591557    
Total params: 2,130,957
Trainable params: 2,130,957
Non-trainable params: 0
_________________________________________________________________


## Fitting Model

In [2]:
import os
import tensorflow as tf
# Location of the weight checkpoint for model 1 and the folder that holds it.
checkpoint_path = "/content/drive/My Drive/lang/Checkpt_spa/cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]

# Callback that writes the model's weights (not the full model) to
# checkpoint_path during training and logs each save.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

In [None]:
# Resume model 1 from the most recent checkpoint, if one exists.
checkpoint_path = "/content/drive/My Drive/lang/Checkpt_spa/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)
# latest_checkpoint returns None when the folder holds no checkpoint (e.g. on
# the very first run); load_weights(None) would raise, so only restore when
# there is something to restore.
if latest is not None:
  model.load_weights(latest)
else:
  print("No checkpoint found in", checkpoint_dir, "- starting from freshly initialised weights.")

In [None]:
# Train model 1: 50 epochs, mini-batches of 64, 10% of the training data held
# out for validation, weights checkpointed through cp_callback.
fit_options = dict(
    batch_size=64,
    epochs=50,
    validation_split=0.1,
    callbacks=[cp_callback],
)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/50
Epoch 00001: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 2/50
Epoch 00002: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 3/50
Epoch 00003: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 4/50
Epoch 00004: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 5/50
Epoch 00005: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 6/50
Epoch 00006: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 7/50
Epoch 00007: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 8/50
Epoch 00008: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 9/50
Epoch 00009: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 10/50
Epoch 00010: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 11/50
Epoch 00011: saving model to /content/drive/My Drive/lang/Checkpt_spa/cp.ckpt
Epoch 12/50
Epoch 0

## Validating Prediction

In [None]:
# (number of held-out sentences, padded sequence length)
X_test.shape

In [None]:
# Predict the first 200 test sentences. Despite the variable name, these are
# softmax probabilities (the model's last layer applies softmax), not raw logits.
y_pred_logits = model.predict(X_test[:200])

In [None]:
# Greedy decoding: for each sentence keep the highest-scoring word id at
# every output position.
y_pred = [np.argmax(step_scores, 1) for step_scores in y_pred_logits]

In [None]:
# Turn id sequences back into text for the same 200 test rows:
# model predictions, reference translations, and the source sentences.
y_pred_text = tokenizer_y.sequences_to_texts(y_pred)
y_test_text = tokenizer_y.sequences_to_texts(y_test[:200])
X_test_text = tokenizer_X.sequences_to_texts(X_test[:200])

In [None]:
# Raw decoded prediction, before the "<unk>" placeholders are stripped.
y_pred_text[145]

'we made him made <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

In [None]:
# Show source, prediction and reference for test example 145, with the
# "<unk>" placeholders removed.
for label, texts in (
    ("Original Sentence: ", X_test_text),
    ("Predicted Sentence: ", y_pred_text),
    ("Actual Sentence: ", y_test_text),
):
  print(label, texts[145].replace("<unk>","").strip())

Original Sentence:  आम्ही त्यांना पळवून लावलं
Predicted Sentence:  we made him made
Actual Sentence:  we drove them out


# PART 2 (Embedding using GloVe)

In [None]:
# Part 2 translates in the opposite direction, so inputs and targets swap
# roles: the language_y sequences become X and the language_x sequences become y.
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split( sequence_y_padded, sequence_x_padded, test_size=0.05, random_state=12)
# Aliases (not copies) of the existing tokenizers, named for their new roles.
tokenizer_X_g = tokenizer_y
tokenizer_y_g = tokenizer_X

In [None]:
# Vocabulary sizes for the reversed direction: +1 because word ids start at 1
# and id 0 is the padding value.
x_vocab_size_g = len(tokenizer_X_g.word_index) + 1
y_vocab_size_g = len(tokenizer_y_g.word_index) + 1
# Padded sequence lengths, i.e. the number of columns of the 2-D id arrays.
x_length_g = X_train_g.shape[1]
y_length_g = y_train_g.shape[1]

print("Vocab size of original language: ",x_vocab_size_g)
print("Vocab size of translated language: ",y_vocab_size_g)
print("Vector length of each sentence in original language: ",x_length_g)
# Label typo fixed ("sentence i translated" -> "sentence in translated").
print("Vector length of each sentence in translated language: ",y_length_g)

Vocab size of original language:  5857
Vocab size of translated language:  13786
Vector length of each sentence in original language:  35
Vector length of each sentence i translated language:  35


In [None]:
# Load the pre-trained 100-d GloVe vectors into a {word: float32 vector} dict.
embeddings_index = {}
glove_path = os.path.join('/content/drive/My Drive/glove.6B/', 'glove.6B.100d.txt')
# Decode explicitly as UTF-8 (the vocabulary contains non-ASCII tokens) and use
# a context manager so the handle is closed even if parsing raises.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_100>, separated by whitespace.
        values = line.split()
        if not values:
            continue  # tolerate blank lines rather than failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# word -> integer id mapping of the input-side tokenizer used in Part 2.
word_i= tokenizer_X_g.word_index

In [None]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer id is
# i; row 0 (padding) and words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(word_i) + 1, 100))
for token, token_id in word_i.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # No pre-trained vector for this word: leave its row as zeros.
        continue
    embedding_matrix[token_id] = glove_vector

## Model with GloVe Embedding

In [None]:
# Model 2: same encoder-decoder layout as model 1, but the input embedding is
# initialised from the GloVe matrix and kept frozen.
model_g = Sequential()
# The Constant initializer loads the pre-trained matrix in a way that works
# across Keras versions; the legacy `weights=[...]` constructor argument is
# not accepted by newer releases. trainable=False freezes the vectors.
model_g.add(Embedding(len(word_i) + 1,
                            100,
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            input_length=x_length_g,
                            trainable=False))
# Encoder: summarise the whole input sentence as a single 100-d vector.
model_g.add(LSTM(100))
# Present that summary vector to the decoder once per output position.
model_g.add(RepeatVector(y_length_g))
# Decoder: one hidden state per output position.
model_g.add(LSTM(100, return_sequences=True))
# Softmax over the output vocabulary at every position.
model_g.add(Dense(y_vocab_size_g, activation='softmax'))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
rms = RMSprop(learning_rate=0.001)
# Targets are integer word ids, hence the sparse form of cross-entropy.
model_g.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
# Print the layer-by-layer output shapes and parameter counts of model 2.
model_g.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 100)           585700    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 35, 100)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 35, 100)           80400     
_________________________________________________________________
dense_1 (Dense)              (None, 35, 13786)         1392386   
Total params: 2,138,886
Trainable params: 1,553,186
Non-trainable params: 585,700
_________________________________________________________________


In [None]:
# Location of the weight checkpoint for model 2 and the folder that holds it.
checkpoint_path_g = "/content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt"
checkpoint_dir_g = os.path.split(checkpoint_path_g)[0]

# Callback that writes model 2's weights (not the full model) to
# checkpoint_path_g during training and logs each save.
cp_callback_g = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path_g,
    save_weights_only=True,
    verbose=1,
)

In [None]:
# Resume model 2 from the most recent checkpoint, if one exists.
# Only the `_g` names are assigned here: the previous version wrote to
# `checkpoint_path` / `checkpoint_dir` (model 1's variables) while actually
# reading the `_g` ones, silently clobbering model 1's settings.
checkpoint_path_g = "/content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt"
checkpoint_dir_g = os.path.dirname(checkpoint_path_g)
latest = tf.train.latest_checkpoint(checkpoint_dir_g)
# latest_checkpoint returns None when the folder holds no checkpoint (e.g. on
# the very first run); load_weights(None) would raise, so only restore when
# there is something to restore.
if latest is not None:
  model_g.load_weights(latest)
else:
  print("No checkpoint found in", checkpoint_dir_g, "- starting from freshly initialised weights.")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc5fe7ec9e8>

In [None]:
# Train model 2: 50 epochs, mini-batches of 64, 10% of the training data held
# out for validation, weights checkpointed through cp_callback_g.
fit_options_g = dict(
    batch_size=64,
    epochs=50,
    validation_split=0.1,
    callbacks=[cp_callback_g],
)
history_g = model_g.fit(X_train_g, y_train_g, **fit_options_g)

Epoch 1/50
Epoch 00001: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 2/50
Epoch 00002: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 3/50
Epoch 00003: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 4/50
Epoch 00004: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 5/50
Epoch 00005: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 6/50
Epoch 00006: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 7/50
Epoch 00007: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 8/50
Epoch 00008: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 9/50
Epoch 00009: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 10/50
Epoch 00010: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ckpt
Epoch 11/50
Epoch 00011: saving model to /content/drive/My Drive/lang/Checkpt_spa_g/cp.ck

## Model 2 prediction (Eng -> Other lang)

In [None]:
# Per-position softmax scores for every held-out sentence (the "logits" in the
# name is historical; the model's last layer applies softmax).
y_pred_logits_g = model_g.predict(X_test_g)

# Greedy decoding: keep the highest-scoring word id at each output position.
y_pred_g = [np.argmax(step_scores, 1) for step_scores in y_pred_logits_g]

In [None]:
# Turn id sequences back into text for the whole test split:
# model-2 predictions, reference translations, and the input sentences.
y_pred_text_g = tokenizer_y_g.sequences_to_texts(y_pred_g)
y_test_text_g = tokenizer_y_g.sequences_to_texts(y_test_g)
X_test_text_g = tokenizer_X_g.sequences_to_texts(X_test_g)

In [None]:
# Show input, prediction and reference for test example 256, with the
# "<unk>" placeholders removed.
for label, texts in (
    ("Original Sentence: ", X_test_text_g),
    ("Predicted Sentence: ", y_pred_text_g),
    ("Actual Sentence: ", y_test_text_g),
):
  print(label, texts[256].replace("<unk>","").strip())

Original Sentence:  where's my mama
Predicted Sentence:  माझी माझी कुठे आहे
Actual Sentence:  माझी मम्मा कुठे आहे


## Part 3: Testing on 5 examples

In [None]:
# Pick five held-out input sentences by row index to round-trip through both models.
example_rows = [120, -2, 256, 123, 456]
five_example_sentence = X_test_g[example_rows]

In [None]:
# (number of example sentences, padded sequence length)
five_example_sentence.shape

(5, 35)

#### Prediction from Model 2 (Eng -> Chosen Lang)

In [None]:
# Run the five examples through model 2.
y_pred_logits_model2 = model_g.predict(five_example_sentence)

# Greedy decoding: keep the highest-scoring word id at each output position.
y_pred_model2 = [np.argmax(step_scores, 1) for step_scores in y_pred_logits_model2]

In [None]:
# Decode model 2's predictions to text, then drop the "<unk>" placeholders and
# surrounding whitespace.
y_pred_text_model2  = tokenizer_y_g.sequences_to_texts(y_pred_model2)
y_pred_text_model2_cleaned = [
    sentence.replace("<unk>","").strip() for sentence in y_pred_text_model2
]

In [None]:
# Cleaned model-2 translations of the five example sentences.
print(y_pred_text_model2_cleaned)

['टॉमला येऊ', 'मला माझी पुस्तक दे', 'माझी माझी कुठे आहे', 'टॉम काहीतरी होता होता', 'टॉम आणि मेरी दोघेही आहेत']


#### Prediction from Model 1 (Chosen Lang -> Eng)

In [None]:
# Re-encode model 2's output text with model 1's input tokenizer and pad it to
# model 1's input width so it can be translated back.
model2_output_ids = tokenizer_X.texts_to_sequences(y_pred_text_model2_cleaned)
model1_test_data_5 = pad_sequences(
    model2_output_ids,
    maxlen=X_test.shape[1],
    padding='post',
)

In [None]:
# Run model 2's translations back through model 1.
y_pred_logits_model1 = model.predict(np.array(model1_test_data_5))

# Greedy decoding: keep the highest-scoring word id at each output position.
y_pred_model1 = [np.argmax(step_scores, 1) for step_scores in y_pred_logits_model1]

In [None]:
# Decode model 1's predictions to text, then drop the "<unk>" placeholders and
# surrounding whitespace.
y_pred_text_model1 = tokenizer_y.sequences_to_texts(y_pred_model1)
y_pred_text_model1_cleaned = [
    sentence.replace("<unk>","").strip() for sentence in y_pred_text_model1
]

#### FINAL RESULTS

In [None]:
# Decode the five example inputs back to text. tokenizer_y is the same object
# as tokenizer_X_g, i.e. the tokenizer that produced these ids.
original_example_sentence = tokenizer_y.sequences_to_texts(five_example_sentence)
original_cleaned = [
    sentence.replace("<unk>","").strip() for sentence in original_example_sentence
]

In [None]:
# Side-by-side report for each example: the original sentence, model 2's
# translation of it, and model 1's translation of model 2's output.
for source_text, model2_text, model1_text in zip(
    original_cleaned, y_pred_text_model2_cleaned, y_pred_text_model1_cleaned
):
  print("Original Sentence:\t\t",source_text)
  print("Predicted Sentence By Model 2:\t",model2_text)
  print("Predicted Sentence By Model 1:\t",model1_text)

Original Sentence:		 let tom in
Predicted Sentence By Model 2:	 टॉमला येऊ
Predicted Sentence By Model 1:	 tom come come
Original Sentence:		 give me my bag
Predicted Sentence By Model 2:	 मला माझी पुस्तक दे
Predicted Sentence By Model 1:	 give me the book
Original Sentence:		 where's my mama
Predicted Sentence By Model 2:	 माझी माझी कुठे आहे
Predicted Sentence By Model 1:	 where's my tom
Original Sentence:		 tom was hiding something
Predicted Sentence By Model 2:	 टॉम काहीतरी होता होता
Predicted Sentence By Model 1:	 tom was something
Original Sentence:		 tom and mary both laughed
Predicted Sentence By Model 2:	 टॉम आणि मेरी दोघेही आहेत
Predicted Sentence By Model 1:	 tom and both both
