# Machine translation

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Loading Dataset
# Read both corpus files completely and split them into one string per line.
# `with` guarantees the handles are closed even if read() raises, and the
# explicit UTF-8 encoding keeps accented French characters intact regardless
# of the platform's default locale.
with open('/content/drive/My Drive/Projects/Machine Translation/small_vocab_fr','r',encoding='utf-8') as ftr:
  ftrain=ftr.read()
ftrain=ftrain.split('\n')
with open('/content/drive/My Drive/Projects/Machine Translation/small_vocab_en','r',encoding='utf-8') as etr:
  etrain=etr.read()
etrain=etrain.split('\n')

In [None]:
# Text preprocessing
# Fit one Keras Tokenizer per language, convert every sentence to a list of
# integer ids, then right-pad with 0. maxlen=None means no fixed length is
# imposed here: pad_sequences pads up to the longest sequence it is given.

import tensorflow as tf

# Padding options shared by both languages.
_pad_options=dict(maxlen=None, dtype='int32', padding='post', truncating='post', value=0)

# French side
tokenizer=tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(ftrain)
fseq=tokenizer.texts_to_sequences(ftrain)
fpad=tf.keras.preprocessing.sequence.pad_sequences(fseq, **_pad_options)

# English side
tokenizer1=tf.keras.preprocessing.text.Tokenizer()
tokenizer1.fit_on_texts(etrain)
eseq=tokenizer1.texts_to_sequences(etrain)
epad=tf.keras.preprocessing.sequence.pad_sequences(eseq, **_pad_options)

In [None]:
# Update the vocabulary
# Register the special tokens in each tokenizer's word->id map and build the
# inverse id->word dictionaries.

# English: only a padding entry (id 0, the pad value used for the sequences).
tokenizer1.word_index.update({'<pad>':0})
en_itow=dict((idx,word) for word,idx in tokenizer1.word_index.items())

# French: padding entry plus a start-of-sentence token.
# NOTE(review): 345 is hard-coded; it is a free id only while the fitted
# French vocabulary has fewer than 345 words -- confirm if the data changes.
tokenizer.word_index.update({'<pad>':0,'<start>':345})
fr_itow=dict((idx,word) for word,idx in tokenizer.word_index.items())

french_size=len(tokenizer.word_index)
english_size=len(tokenizer1.word_index)
print('French_vocab size {}\nEnglish_vocab size {}'.format(french_size,english_size))

French_vocab size 346
English_vocab size 200


In [None]:
# Report the shapes of the two padded id matrices.
shape_report='Shapes:\nFrench:{}\nEnglish:{}'
print(shape_report.format(fpad.shape,epad.shape))

Shapes:
French:(137861, 21)
English:(137861, 15)


In [None]:
# Number of entries in the English word->id map (this count includes the
# '<pad>' entry once the vocabulary-update cell has run).
len(tokenizer1.word_index)

200

In [None]:
# Scratch check: feed the first two padded English sentences through a
# throw-away embedding and a bidirectional GRU that returns the per-step
# outputs as well as its final states, to inspect the result shapes.
inp=epad[:2]
em=tf.keras.layers.Embedding(200,12)(inp)
_probe_gru_seq=tf.keras.layers.GRU(15,activation='relu',return_state=True,return_sequences=True)
l=tf.keras.layers.Bidirectional(_probe_gru_seq)(em)

In [None]:
# Print the shape of every tensor returned by the probe layer, one per line.
for shape in (part.shape for part in l):
  print(shape)

(2, 15, 30)
(2, 15)
(2, 15)


In [None]:
# Same probe without return_sequences, applied to the embedded batch `em`.
_probe_gru_last=tf.keras.layers.GRU(15,activation='relu',return_state=True)
l2=tf.keras.layers.Bidirectional(_probe_gru_last)(em)

In [None]:
# Print the shape of every tensor returned by the second probe, one per line.
for shape in (part.shape for part in l2):
  print(shape)

(2, 30)
(2, 15)
(2, 15)


In [None]:
# Scratch check using the decoder's layer sizes (346-entry embedding,
# 21-unit GRU) on the current `inp` batch.
_probe_embedding=tf.keras.layers.Embedding(346,12)
_probe_decoder_gru=tf.keras.layers.GRU(21,activation='relu',return_state=True,return_sequences=True)
x=_probe_embedding(inp)
o=_probe_decoder_gru(x)

In [None]:
# Print the shape of the GRU's sequence output and of its final state.
for shape in (part.shape for part in o):
  print(shape)

(60, 15, 21)
(60, 21)


In [None]:
# Shape of whatever batch is currently bound to `inp` (the name is rebound
# by several cells, so the value depends on which cell ran last).
inp.shape

(60, 15)

In [None]:
# Building encoder with two stacked bidirectional GRU layers
tf.keras.backend.clear_session()

class Encoder(tf.keras.Model):
  """Encode a batch of padded English id sequences into one context vector each.

  Pipeline: embedding (200 ids -> 12 dims) -> dropout -> bidirectional GRU
  (15 units per direction, full sequence) -> dropout -> bidirectional GRU
  (15 units per direction, last step only).
  """

  def __init__(self):
    super(Encoder,self).__init__()
    self.em=tf.keras.layers.Embedding(200,12)
    self.drop=tf.keras.layers.Dropout(0.2)
    self.drop1=tf.keras.layers.Dropout(0.2)
    self.enc=tf.keras.layers.Bidirectional(tf.keras.layers.GRU(15,activation='relu',return_state=True,return_sequences=True))
    self.enc1=tf.keras.layers.Bidirectional(tf.keras.layers.GRU(15,activation='relu',return_state=True))

  def call(self,inputs,training=None):
    """Return the encoded batch with shape (batch, 1, 30).

    Args:
      inputs: integer id matrix, one padded sentence per row.
      training: forwarded to the dropout and recurrent layers. Leave as None
        to let Keras decide; pass True while training so dropout is applied.
    """
    emb=self.em(inputs)
    d1=self.drop(emb,training=training)
    # Each Bidirectional layer returns [output, forward_state, backward_state];
    # only the output (index 0) is used.
    en1=self.enc(d1,training=training)
    d2=self.drop1(en1[0],training=training)
    en2=self.enc1(d2,training=training)
    # Insert a length-1 time axis: (batch, 30) -> (batch, 1, 30). expand_dims
    # does not need the batch size to be statically known, unlike a reshape
    # built from `.shape[0]`.
    out=tf.expand_dims(en2[0],axis=1)
    return out

enc=Encoder()

In [None]:
# Smoke test: run the encoder on the first five padded English sentences.
inp=epad[0:5]
h=enc(inp)

In [None]:
# Shape of the encoder output for the batch above; the recorded result is
# (5, 1, 30): 5 sentences, a singleton axis added in Encoder.call, 30 features.
h.shape

TensorShape([5, 1, 30])

In [None]:
class Decoder(tf.keras.Model):
  """Single-step decoder producing a distribution over the 346 French ids.

  The GRU layers are not given an initial state. Instead, the hidden vectors
  from the previous step are concatenated to the layer inputs as features.
  """

  def __init__(self):
    super(Decoder,self).__init__()
    self.em=tf.keras.layers.Embedding(346,12)
    self.drop=tf.keras.layers.Dropout(0.2)
    self.drop1=tf.keras.layers.Dropout(0.2)
    self.dec=tf.keras.layers.GRU(21,activation='relu',return_state=True,return_sequences=True)
    self.dec1=tf.keras.layers.GRU(21,activation='relu',return_state=True,return_sequences=True)
    # Softmax, not sigmoid: the classes are mutually exclusive and the
    # notebook's SparseCategoricalCrossentropy (from_logits=False) expects a
    # probability distribution over the vocabulary.
    self.dense=tf.keras.layers.Dense(346,activation='softmax')

  def call(self,enc_hidden,dec_hidden,dec_hidden2,inp,training=None):
    """Decode one time step.

    Args:
      enc_hidden: encoder context; must have a length-1 axis 1 so it can be
        concatenated with the embedded input (presumably (batch, 1, 30) from
        Encoder -- confirm against caller).
      dec_hidden: previous state of the first GRU, (batch, 21) or (batch, 1, 21).
      dec_hidden2: previous state of the second GRU, same layouts accepted.
      inp: previous token ids, one per row (assumed (batch, 1)).
      training: forwarded to the dropout and GRU layers. Leave as None to let
        Keras decide; pass True while training so dropout is applied.

    Returns:
      (dec_hidden, pred, dec_hidden2): new first-layer state, per-step
      vocabulary probabilities, new second-layer state.
    """
    x=self.em(inp)
    # -1 lets TensorFlow infer the batch size, so this also works when the
    # static batch dimension is unknown.
    dec_hidden=tf.reshape(dec_hidden,shape=(-1,1,dec_hidden.shape[-1]))
    concat=tf.keras.layers.concatenate([x,enc_hidden,dec_hidden],axis=2)
    drop=self.drop(concat,training=training)
    dec_out,dec_hidden=self.dec(drop,training=training)
    dec_hidden2=tf.reshape(dec_hidden2,shape=(-1,1,dec_hidden2.shape[-1]))
    concat1=tf.keras.layers.concatenate([dec_out,dec_hidden2],axis=2)
    drop1=self.drop1(concat1,training=training)
    dec_out2,dec_hidden2=self.dec1(drop1,training=training)
    pred=self.dense(dec_out2)
    return dec_hidden,pred,dec_hidden2

dec=Decoder()

In [None]:
# Creating required APIs
# Adam with default settings updates the encoder and decoder weights.
optimizer = tf.keras.optimizers.Adam()
# Default arguments are used (from_logits is not set), so the loss is given
# the decoder's activated outputs together with integer target ids.
loss_ = tf.keras.losses.SparseCategoricalCrossentropy()
# NOTE: the name `acc` is rebound further down in this notebook by a function
# `acc(y_true,y_pred)`; which object `acc` refers to depends on cell order.
acc=tf.keras.metrics.Accuracy()

In [None]:

def train_step(inp,out):
  """Run one teacher-forced optimisation step on a single batch.

  Args:
    inp: padded English id matrix, one sentence per row.
    out: padded French id matrix aligned with `inp`; arrays and eager tensors
      are both accepted.

  Returns:
    (ls, ypred): the scalar loss and the stacked per-step vocabulary
    probabilities with shape (batch, out.shape[1], vocab).

  Notes:
    * Both models are called with training=True so their Dropout layers are
      active; without the flag Keras runs them in inference mode.
    * Padding positions (id 0) in `out` contribute to the loss like any other
      token.
    * 21 (decoder state size) and 345 ('<start>' id) mirror the values
      hard-coded in the Decoder and vocabulary cells.
  """
  out=np.asarray(out)
  batch=inp.shape[0]
  initializer=tf.random_normal_initializer(seed=33)
  # Plain tensors suffice: the initial states are inputs, not trainable weights.
  dec_hidden=initializer(shape=(batch,21),dtype=tf.float32)
  dec_hidden2=initializer(shape=(batch,21),dtype=tf.float32)
  dec_input=np.full((batch,1),345)
  pred_out=[]
  with tf.GradientTape() as tape:
    enc_hidden=enc(inp,training=True)
    for i in range(out.shape[1]):
      dec_hidden,pred,dec_hidden2=dec(enc_hidden,dec_hidden,dec_hidden2,dec_input,training=True)
      # Teacher forcing: the ground-truth token at step i is the next input.
      dec_input=out[:,i:i+1]
      pred_out.append(pred)
    ypred=tf.concat(pred_out,axis=1)
    ls=loss_(out,ypred)

  variables = enc.trainable_variables + dec.trainable_variables
  gradients = tape.gradient(ls, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return ls,ypred

In [None]:
# Training...
# Batches of 60 sentence pairs; every 10 batches the running lists are folded
# into `ml`/`ma` (mean loss / accuracy of the batches since the last fold) and
# every 1000 batches the latest pair is printed.
trn=tf.data.Dataset.from_tensor_slices((epad,fpad)).batch(60)
epochs=10
print('Training starts ...')
for j in range(epochs):
  print('\n\n---Epoch {}  :----'.format(j))
  a=[]
  l=[]
  for i,(inp,out) in enumerate(trn):
    inp=inp.numpy()
    out=out.numpy()
    ls,ypred=train_step(inp,out)
    # argmax over the vocabulary axis already has the same shape as `out`.
    ypred=tf.math.argmax(ypred,axis=2).numpy()
    # Per-batch token accuracy (padding positions included), computed directly.
    # The stateful Keras metric returns a cumulative value on every call, so
    # averaging its outputs is misleading, and the global name `acc` is
    # rebound to an incompatible function later in the notebook.
    ac=np.mean(ypred==out)
    l.append(float(ls))
    a.append(ac)
    if i%10==0:
      ma=np.mean(a)
      ml=np.mean(l)
      a=[]
      l=[]
    if i%1000==0:
      print('At {}th batch    mean_loss={} , mean_accuracy={}'.format(i,ml,ma))
 

Training starts ...


---Epoch 0  :----
At 0th batch    mean_loss=1.1899663209915161 , mean_accuracy=0.3198452889919281


KeyboardInterrupt: ignored

In [None]:
# creating Accuracy function

def acc(y_true,y_pred):
  """Token-level accuracy that ignores padding positions.

  Args:
    y_true: integer target ids; entries equal to 0 are treated as padding and
      excluded. It is reshaped to match the argmax of `y_pred`, so any layout
      with the same number of elements is accepted.
    y_pred: per-token class scores with the vocabulary on the last axis.

  Returns:
    Scalar float64 tensor: correct non-padding tokens / non-padding tokens,
    or 0 when every target is padding.

  NOTE: this definition rebinds the global name `acc`, which an earlier cell
  assigns to a `tf.keras.metrics.Accuracy` instance.
  """
  y_pred=tf.math.argmax(y_pred,axis=-1)
  y_true=tf.cast(y_true,dtype=tf.int64)
  # Align the targets with the predictions without hard-coding the length.
  y_true=tf.reshape(y_true,tf.shape(y_pred))
  mask=tf.not_equal(y_true,0)
  hits=tf.logical_and(mask,tf.equal(y_true,y_pred))
  sm=tf.reduce_sum(tf.cast(hits,tf.float64))
  total=tf.reduce_sum(tf.cast(mask,tf.float64))
  # divide_no_nan returns 0 instead of NaN/inf when there is nothing to score.
  return tf.math.divide_no_nan(sm,total)

In [None]:
# predicting a sample

def predict(inp,max_len=21,start_token=345):
  """Greedily decode French token ids for a batch of English id sequences.

  Args:
    inp: padded English id matrix, one sentence per row.
    max_len: number of decoding steps, i.e. output length (default 21, the
      value previously hard-coded).
    start_token: id fed to the decoder at the first step (default 345, the
      '<start>' id registered in the vocabulary cell).

  Returns:
    Integer numpy array of shape (batch, max_len) with the arg-max token id
    chosen at each step.
  """
  batch=inp.shape[0]
  initializer=tf.random_normal_initializer(seed=33)
  # Plain tensors suffice for the initial decoder states; 21 is the decoder
  # GRU size.
  dec_hidden=initializer(shape=(batch,1,21),dtype=tf.float32)
  dec_hidden2=initializer(shape=(batch,1,21),dtype=tf.float32)
  dec_input=np.full((batch,1),start_token)
  enc_hidden=enc(inp)
  steps=[]
  for _ in range(max_len):
    dec_hidden,pred,dec_hidden2=dec(enc_hidden,dec_hidden,dec_hidden2,dec_input)
    # Greedy choice: the most probable token becomes the next decoder input.
    dec_input=tf.reshape(tf.math.argmax(pred,axis=2),shape=(batch,1))
    steps.append(dec_input)
  if not steps:
    return np.zeros((batch,0),dtype=np.int64)
  return tf.concat(steps,axis=1).numpy()


In [None]:
# Short aliases used by the sample-translation cell below.
x=epad          # padded English ids
y=fpad          # padded French ids
x_tk=tokenizer1 # English tokenizer
y_tk=tokenizer  # French tokenizer

In [None]:
# Translate one hand-written sentence and the first training sentence.
# pad_sequences is taken from tf.keras, like the rest of the notebook, rather
# than from the standalone `keras` package, whose
# `keras.preprocessing.sequence` import path is not available in every release.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
y_id_to_word[0] = '<PAD>'
sentence = 'he saw a old yellow truck'
# Every word must exist in the English vocabulary; an unknown word raises KeyError.
sentence = [x_tk.word_index[word] for word in sentence.split()]
sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
sentences = np.array([sentence[0], x[0]])
predictions = predict(sentences)

print('Sample 1:')
print(' '.join([y_id_to_word[idx] for idx in predictions[0]]))
print('Il a vu un vieux camion jaune')
print('Sample 2:')
print(' '.join([y_id_to_word[idx] for idx in predictions[1]]))
# Reference translation of the first training sentence.
print(' '.join([y_id_to_word[idx] for idx in y[0]]))

Sample 1:
les inde est oranges et les bananes <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Il a vu un vieux camion jaune
Sample 2:
les inde est jamais froid en l' et et il est jamais froid en l' automne <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
# Testing sample inputs
# Decode the first ten training sentences and print them next to the
# reference French sentences.

inp=epad[0:10]
ya=fpad[0:10]
# Stored under its own name: the global `y` aliases the French id matrix and
# is read by the sample-translation cell, so it must not be overwritten.
yp=predict(inp)
for i in range(len(ya)):
  fa=[fr_itow[j] for j in ya[i]]
  fp=[fr_itow[j] for j in yp[i]]
  print('Actual french ={}\nPrdicted French = {}'.format(fa,fp))