<a href="https://colab.research.google.com/github/Satyake/NLP/blob/master/Machine_Translation%20_Eng-Fra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk
!pip install gensim
!pip install spacy
!pip install plotly
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np 
import re
import tensorflow as tf

In [3]:

df_english=pd.read_csv('/content/small_vocab_en.csv',sep='\t',names=['english'])
df_french=pd.read_csv('/content/small_vocab_fr.csv',sep='\t',names=['french'])


In [4]:
df_english.head()


Unnamed: 0,english
0,"new jersey is sometimes quiet during autumn , ..."
1,the united states is usually chilly during jul...
2,"california is usually quiet during march , and..."
3,the united states is sometimes mild during jun...
4,"your least liked fruit is the grape , but my l..."


In [5]:
df_french.head()

Unnamed: 0,french
0,new jersey est parfois calme pendant l' automn...
1,les états-unis est généralement froid en juill...
2,"california est généralement calme en mars , et..."
3,"les états-unis est parfois légère en juin , et..."
4,"votre moins aimé fruit est le raisin , mais mo..."


In [6]:
df_concat=pd.concat([df_english,df_french],axis=1)

In [7]:
df_concat

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...
2,"california is usually quiet during march , and...","california est généralement calme en mars , et..."
3,the united states is sometimes mild during jun...,"les états-unis est parfois légère en juin , et..."
4,"your least liked fruit is the grape , but my l...","votre moins aimé fruit est le raisin , mais mo..."
...,...,...
137855,"france is never busy during march , and it is ...","la france est jamais occupée en mars , et il e..."
137856,"india is sometimes beautiful during spring , a...","l' inde est parfois belle au printemps , et il..."
137857,"india is never wet during summer , but it is s...","l' inde est jamais mouillé pendant l' été , ma..."
137858,"france is never chilly during january , but it...","la france est jamais froid en janvier , mais i..."


In [8]:
print('Total English Records={}'.format(len(df_concat['english'])))
print('Total French Records={}'.format(len(df_concat['french'])))

Total English Records=137860
Total French Records=137860


In [9]:
#to remove punc
def remove_punc(x):
  return re.sub('[!#?,.;:]','',x)

In [10]:
df_concat['french']=df_concat['french'].apply(remove_punc)
df_concat['english']=df_concat['english'].apply(remove_punc)

In [11]:
english_words=[]
french_words=[]

In [12]:
def unique_words(df):
  unique_words=[]
  tknizer=tf.keras.preprocessing.text.Tokenizer()
  tknizer.fit_on_texts(df)
  dictionary=tknizer.word_index
  print(len(tknizer.word_index))
  for item1, item2 in dictionary.items():
    unique_words.append(item1)
  return len(unique_words)


In [13]:
len_eng=unique_words(df_concat['english'])

199


In [14]:
len_fra=unique_words(df_concat['french'])

344


In [15]:
maxlen_english = -1
tknizer=tf.keras.preprocessing.text.Tokenizer()
tknizer.fit_on_texts(df_concat['english'])
len(tknizer.word_index)


199

In [16]:
def max_num_words_in_rows(df):
  maxlen_english = -1
  counts=[]
  for doc in df_concat.english:
    tokens = nltk.word_tokenize(doc)
    counts.append(len(tokens))
  return max(counts)

In [17]:
maxlen_english=max_num_words_in_rows(df_concat['english'])
maxlen_french=max_num_words_in_rows(df_concat['french'])

In [18]:
maxlen_english

15

In [19]:
#Tokenization and Padings

In [20]:
def tokenize_and_pad(x,max_len):
  tokenizer=tf.keras.preprocessing.text.Tokenizer()
  tokenizer.fit_on_texts(x)
  sequences=tokenizer.texts_to_sequences(x)
  padded=tf.keras.preprocessing.sequence.pad_sequences(sequences,maxlen=max_len,padding='post')
  return tokenizer, sequences,padded

In [21]:
x_tokenizer,x_sequences,x_padded=tokenize_and_pad(df_concat['english'],maxlen_english)
y_tokenizer,y_sequences,y_padded=tokenize_and_pad(df_concat['french'],maxlen_french)

In [22]:
print(x_sequences[0])
print(y_sequences[0])
print(x_padded[0])
print(y_padded[0])

[17, 23, 1, 8, 67, 4, 39, 7, 3, 1, 55, 2, 44]
[35, 34, 1, 8, 67, 37, 11, 24, 6, 3, 1, 112, 2, 50]
[17 23  1  8 67  4 39  7  3  1 55  2 44  0  0]
[ 35  34   1   8  67  37  11  24   6   3   1 112   2  50   0]


In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_padded,y_padded,test_size=0.1)

In [24]:
x_train.shape

(124074, 15)

In [32]:
#Building the Network
eng_vocab_size=len_eng+1
fra_vocab_size=len_fra+1
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(eng_vocab_size,256,input_length=maxlen_english,mask_zero=True))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300),merge_mode='mul'))
model.add(tf.keras.layers.RepeatVector(maxlen_french))
model.add(tf.keras.layers.LSTM(25,return_sequences=True))
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(fra_vocab_size,activation='softmax')))
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 256)           51200     
                                                                 
 bidirectional (Bidirectiona  (None, 300)              1336800   
 l)                                                              
                                                                 
 repeat_vector_1 (RepeatVect  (None, 15, 300)          0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 15, 25)            32600     
                                                                 
 time_distributed_1 (TimeDis  (None, 15, 345)          8970      
 tributed)                                                       
                                                      

In [43]:
#y_train=np.expand_dims(y_train,axis=2)
y_train.shape

(124074, 15)

In [34]:
eng_vocab_size

200

In [35]:
x_train.shape

(124074, 15)

In [38]:
#rlop=tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy')
rllr=tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',min_delta=0.03,patience=30)

In [39]:
history=model.fit(x_train,y_train,batch_size=1024,validation_split=0.25,epochs=200,callbacks=[rllr])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [40]:
model.save('lstm.h5')

In [41]:
x_test.shape

(13786, 15)

In [42]:
y_test.shape

(13786, 15)

In [44]:
y_pred=model.predict(x_test)

In [None]:
y_pred

In [46]:
def prediction(x,x_tokenizer=x_tokenizer,y_tokenizer=y_tokenizer):
  predictions=model.predict(x)[0]
  id_to_word={ id: word for word, id in y_tokenizer.word_index.items()}
  id_to_word[0]=''
  return ' '.join([id_to_word[j] for j in np.argmax(predictions,1)])
  

In [47]:
def pad_to_text(padded,tokenizer):
  id_to_word={id: word for word,id in tokenizer.word_index.items()}
  id_to_word[0]=''
  return ' '.join([id_to_word[j] for j in padded])

In [53]:
for i in range(5):
  print('Original English-{}\n' .format(pad_to_text(x_test[i],x_tokenizer)))
  print('Original French-{}\n' .format(pad_to_text(x_test[i],x_tokenizer)))
  print( 'Predicted-{}\n\n\n\n' .format(prediction(x_test[i:i+1])))

Original English-he likes oranges and mangoes          

Original French-he likes oranges and mangoes          

Predicted-il aime les oranges et les mangues        




Original English-india is chilly during june and it is wonderful in may    

Original French-india is chilly during june and it is wonderful in may    

Predicted-l' inde est froid en juin et il est merveilleux en mai   




Original English-he likes limes oranges and mangoes         

Original French-he likes limes oranges and mangoes         

Predicted-il aime les oranges les citrons verts et les mangues     




Original English-the orange is your least liked fruit but the apple is his least liked 

Original French-the orange is your least liked fruit but the apple is his least liked 

Predicted-l'orange est votre fruit moins aimé mais la fraise est son moins aimé  




Original English-california is usually relaxing during spring but it is never busy in september  

Original French-california is usually relaxing d