<a href="https://colab.research.google.com/github/Satyake/Getting-Started-with-TF2/blob/main/Neural_Translation_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string
import tensorflow as tf
import numpy as np 

In [None]:
# Location of the tab-separated English/French sentence-pair file.
# NOTE(review): '/content' is presumably the Colab runtime directory -- adjust when running elsewhere.
data_path='/content/eng-fra.txt'

In [None]:
# Read the entire corpus file into one string; it is split into rows afterwards.
with open(data_path,mode='r',encoding='utf-8') as corpus_file:
  lines=corpus_file.read()

In [None]:
def to_lines(text):
  """Split raw corpus text into a list of rows, each a list of tab-separated fields.

  Leading and trailing whitespace of the whole text is stripped first, then the
  text is cut on newline characters and every resulting line is cut on tabs.

  Args:
    text: the full file contents as one string.

  Returns:
    A list with one entry per line; each entry is the list of fields on that line.
  """
  rows=[]
  for raw_line in text.strip().split('\n'):
    rows.append(raw_line.split('\t'))
  return rows

In [None]:
# Parse the raw text into one [English, French] field list per line of the file.
fra_eng=to_lines(lines)
#fra_eng

In [None]:
# Convert the nested list to a 2-D NumPy string array so each language can be selected as a column.
fra_eng=np.array(fra_eng)

In [None]:
fra_eng[:5]

array([['English', 'French'],
       ['Go.', 'Va !'],
       ['Run!', 'Cours\u202f!'],
       ['Run!', 'Courez\u202f!'],
       ['Wow!', 'Ã‡a alors\u202f!']], dtype='<U349')

In [None]:
fra_eng.shape

(135843, 2)

In [None]:
# Keep only the first 9000 rows (presumably to keep training time manageable).
# Row 0 is the 'English'/'French' header line, not a sentence pair.
fra_eng=fra_eng[:9000]

In [None]:
fra_eng.shape

(9000, 2)

In [None]:
# Characters deleted from every sentence before tokenization.
PUNCTUATION_TO_DROP=',!.'

def strip_punctuation(sentence):
  """Return `sentence` with every character found in PUNCTUATION_TO_DROP removed."""
  return ''.join(ch for ch in sentence if ch not in PUNCTUATION_TO_DROP)

# Iterate over the rows actually present in fra_eng instead of a hard-coded count of 9000,
# so this cell keeps working if the subset size chosen earlier is changed.
# Column 0 is the English side, column 1 the French side.
cleaned_english=[strip_punctuation(sentence) for sentence in fra_eng[:,0]]
cleaned_french=[strip_punctuation(sentence) for sentence in fra_eng[:,1]]



In [None]:
# Convert the cleaned sentence lists to NumPy string arrays.
cleaned_english=np.array(cleaned_english)
cleaned_french=np.array(cleaned_french)

In [None]:
# Sanity check on the English side; the French side is checked in the next cell.
cleaned_english.shape

(9000,)

In [None]:
cleaned_french.shape

(9000,)

In [None]:
import pandas as pd
# Wrap each language's cleaned sentences in its own single-column DataFrame.
df1=pd.DataFrame(cleaned_english)
df2=pd.DataFrame(cleaned_french)

In [None]:
# Place the two columns side by side: English first, French second.
eng_fra_cleaned=pd.concat([df1,df2],axis=1)

In [None]:
# Row 0 carries the column labels ('English', 'French'): use it as the header
# and keep every following row as the actual sentence data.
new_header=eng_fra_cleaned.iloc[0]
df=eng_fra_cleaned.iloc[1:]
df.columns=new_header

In [None]:
def tokenization(lines):
  """Create a Keras Tokenizer with default settings and fit it on `lines`.

  Args:
    lines: an iterable of text strings used to build the vocabulary.

  Returns:
    The fitted tf.keras.preprocessing.text.Tokenizer.
  """
  fitted=tf.keras.preprocessing.text.Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

def encode_seq(tokenizer,length,lines):
  """Turn texts into fixed-length integer sequences.

  Args:
    tokenizer: a fitted tokenizer exposing texts_to_sequences.
    length: number of entries in every returned sequence (passed as maxlen).
    lines: an iterable of text strings to encode.

  Returns:
    The result of pad_sequences: one row per text, padded at the end to `length`.
  """
  return tf.keras.preprocessing.sequence.pad_sequences(
      tokenizer.texts_to_sequences(lines),maxlen=length,padding='post')

In [None]:
# Fit one vocabulary per language on the cleaned sentences in df.
eng_tokenizer=tokenization(df['English'])
fra_tokenizer=tokenization(df['French'])

In [None]:
from sklearn.model_selection import train_test_split
# Split the cleaned frame (the same text the tokenizers were fit on) rather than the raw array:
# the raw array's first row is the 'English'/'French' header line, which would otherwise be
# treated as a sentence pair. Column 0 stays English and column 1 stays French.
train,test=train_test_split(df.to_numpy(),test_size=0.02,random_state=1)

In [None]:
train.shape

(8820, 2)

In [None]:
# Column 0 of the split holds English and column 1 holds French. The model input is encoded with
# fra_tokenizer and the target with eng_tokenizer (predictions are decoded with eng_tokenizer),
# so the French column must feed x and the English column must feed y. Encoding a sentence with
# the other language's tokenizer drops nearly every word as out-of-vocabulary.
x_train=encode_seq(fra_tokenizer,8,train[:,1])
y_train=encode_seq(eng_tokenizer,8,train[:,0])

x_test=encode_seq(fra_tokenizer,8,test[:,1])
y_test=encode_seq(eng_tokenizer,8,test[:,0])

In [None]:
def model1(in_vocab,out_vocab,in_timestep,out_timesteps,units):
    """Build an uncompiled encoder-decoder LSTM as a Keras Sequential model.

    Args:
        in_vocab: input_dim of the embedding layer (size of the input vocabulary).
        out_vocab: width of the final softmax layer (size of the output vocabulary).
        in_timestep: length of each padded input sequence.
        out_timesteps: number of output positions produced per sample.
        units: embedding dimension and hidden size of both LSTM layers.

    Returns:
        A tf.keras.Sequential model emitting a softmax over out_vocab at every output position.
    """
    layers=tf.keras.layers
    encoder=[
        # mask_zero=True makes index 0 a padding value that downstream layers skip.
        layers.Embedding(in_vocab,units,input_length=in_timestep,mask_zero=True),
        # Without return_sequences only the final state is emitted.
        layers.LSTM(units),
    ]
    decoder=[
        # Repeat the encoder output once per output position.
        layers.RepeatVector(out_timesteps),
        layers.LSTM(units,return_sequences=True),
        layers.Dense(out_vocab,activation='softmax'),
    ]
    return tf.keras.Sequential(encoder+decoder)




In [None]:
# Keras Tokenizer numbers words from 1 and index 0 is the padding value, so any layer that must
# cover every index needs len(word_index)+1 entries. Without the +1 the highest-numbered word
# falls outside the embedding table / softmax output.
eng_length=len(eng_tokenizer.word_index)+1
fra_length=len(fra_tokenizer.word_index)+1

In [None]:
fra_length

4214

In [None]:
# Inputs are encoded with fra_tokenizer and targets with eng_tokenizer, so the embedding must be
# sized by the French vocabulary and the softmax by the English one (model1 takes in_vocab first).
model=model1(fra_length,eng_length,8,8,512)

In [None]:
# Use the lowercase 'accuracy' alias: Keras resolves it to sparse categorical accuracy for this loss.
# The capitalised 'Accuracy' selects the exact-match metric, which compares the raw softmax outputs
# with the integer labels and therefore reports a meaningless value here.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Train for 10 epochs, evaluating on the held-out split after every epoch.
history=model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
import numpy as np 
# The model ends in LSTM(return_sequences=True) + Dense(softmax), so predict() yields one
# distribution per output position: (samples, timesteps, vocab). The predicted token id is the
# argmax over the last (vocabulary) axis; axis=1 would reduce over the timesteps instead.
preds=np.argmax(model.predict(x_test),axis=-1)

In [None]:
len(preds)

180

In [None]:
def get_word(n,tokenizer):
  """Return the word whose tokenizer index is `n`, or None when no word has that index.

  Uses the tokenizer's `index_word` reverse mapping (filled by fit_on_texts alongside
  `word_index`) for a direct dictionary lookup instead of scanning every vocabulary
  entry on each call.

  Args:
    n: integer token id (Python or NumPy integer).
    tokenizer: a fitted Keras Tokenizer.

  Returns:
    The matching word, or None (e.g. for the padding id 0 or an unknown id).
  """
  return tokenizer.index_word.get(n)
  


In [None]:

# Decode every predicted id sequence into text. A position becomes an empty string when its id
# has no word or when it repeats the word at the previous position; positions are then joined
# with single spaces.
preds_text=[]
for token_ids in preds:
  words=[get_word(token_id,eng_tokenizer) for token_id in token_ids]
  kept=[]
  for pos,word in enumerate(words):
    is_repeat=pos>0 and word==words[pos-1]
    kept.append('' if (word is None or is_repeat) else word)
  preds_text.append(' '.join(kept))
       

In [None]:
preds_text

In [None]:
# Column 0 of the test split is the English side; show it next to the decoded predictions.
pred_df=pd.DataFrame({'actual':test[:,0],'predicted':preds_text})

In [None]:
pred_df.head(10)