In [None]:
# Install the dependencies if not installed
!pip install --upgrade tensorflow-gpu=2.3.0

!pip install nltk

!pip install gensim

!pip install spacy

!pip install plotly

!pip install pandas

!pip install numpy

!pip install matplotlib

!pip install seaborn

!pip install wordcloud

!pip install sklearn

In [37]:
# Here we will import all required libraries

import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, TimeDistributed, LSTM, Embedding, RepeatVector
import tensorflow as tf

In [2]:
# Load the dataset
df_english = pd.read_csv('small_vocab_en.csv', sep='/t', names=['english'])
df_french = pd.read_csv('small_vocab_fr.csv', sep='/t', names=['french'])

  df_english = pd.read_csv('small_vocab_en.csv', sep='/t', names=['english'])
  df_french = pd.read_csv('small_vocab_fr.csv', sep='/t', names=['french'])


In [3]:
# Visualize english data
df_english

Unnamed: 0,english
0,"new jersey is sometimes quiet during autumn , ..."
1,the united states is usually chilly during jul...
2,"california is usually quiet during march , and..."
3,the united states is sometimes mild during jun...
4,"your least liked fruit is the grape , but my l..."
...,...
137855,"france is never busy during march , and it is ..."
137856,"india is sometimes beautiful during spring , a..."
137857,"india is never wet during summer , but it is s..."
137858,"france is never chilly during january , but it..."


In [4]:
# Visualize french data
df_french

Unnamed: 0,french
0,new jersey est parfois calme pendant l' automn...
1,les Ã©tats-unis est gÃ©nÃ©ralement froid en ju...
2,"california est gÃ©nÃ©ralement calme en mars , ..."
3,"les Ã©tats-unis est parfois lÃ©gÃ¨re en juin ,..."
4,"votre moins aimÃ© fruit est le raisin , mais m..."
...,...
137855,"la france est jamais occupÃ©e en mars , et il ..."
137856,"l' inde est parfois belle au printemps , et il..."
137857,"l' inde est jamais mouillÃ© pendant l' Ã©tÃ© ,..."
137858,"la france est jamais froid en janvier , mais i..."


In [5]:
# Check if there is any NULL data
print(df_english.isnull().sum())
print(df_french.isnull().sum())

english    0
dtype: int64
french    0
dtype: int64


In [8]:
# Concat both side by side for easy viewing
df = pd.concat([df_english, df_french], axis=1)
df

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les Ã©tats-unis est gÃ©nÃ©ralement froid en ju...
2,"california is usually quiet during march , and...","california est gÃ©nÃ©ralement calme en mars , ..."
3,the united states is sometimes mild during jun...,"les Ã©tats-unis est parfois lÃ©gÃ¨re en juin ,..."
4,"your least liked fruit is the grape , but my l...","votre moins aimÃ© fruit est le raisin , mais m..."
...,...,...
137855,"france is never busy during march , and it is ...","la france est jamais occupÃ©e en mars , et il ..."
137856,"india is sometimes beautiful during spring , a...","l' inde est parfois belle au printemps , et il..."
137857,"india is never wet during summer , but it is s...","l' inde est jamais mouillÃ© pendant l' Ã©tÃ© ,..."
137858,"france is never chilly during january , but it...","la france est jamais froid en janvier , mais i..."


In [9]:
def remove_punc(x):
    return re.sub('[!#?,.:";]', '', x)

In [10]:
df['english'] = df['english'].apply(remove_punc)
df['french'] = df['french'].apply(remove_punc)

In [11]:
df

Unnamed: 0,english,french
0,new jersey is sometimes quiet during autumn a...,new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les Ã©tats-unis est gÃ©nÃ©ralement froid en ju...
2,california is usually quiet during march and ...,california est gÃ©nÃ©ralement calme en mars e...
3,the united states is sometimes mild during jun...,les Ã©tats-unis est parfois lÃ©gÃ¨re en juin ...
4,your least liked fruit is the grape but my le...,votre moins aimÃ© fruit est le raisin mais mo...
...,...,...
137855,france is never busy during march and it is s...,la france est jamais occupÃ©e en mars et il e...
137856,india is sometimes beautiful during spring an...,l' inde est parfois belle au printemps et il ...
137857,india is never wet during summer but it is so...,l' inde est jamais mouillÃ© pendant l' Ã©tÃ© ...
137858,france is never chilly during january but it ...,la france est jamais froid en janvier mais il...


In [13]:
# To get number of unique words

english_words = []
french_words = []

def get_unique(x, word_list):
    for word in x.split():
        if word not in word_list:
            word_list.append(word)
        
df['english'].apply(lambda x:get_unique(x,english_words))
df['french'].apply(lambda x:get_unique(x,french_words))

total_english_words = len(english_words)
total_french_words = len(french_words)

print("Number of unique english words ", total_english_words)
print("Number of unique english words ", total_french_words)

Number of unique english words  199
Number of unique english words  350


In [14]:
# Find max number of words in sentence so we can later do padding

maxlen_english = -1
for doc in df.english:
    tokens = nltk.word_tokenize(doc)
    if maxlen_english < len(tokens):
        maxlen_english = len(tokens)
        
print("Max number of words in any english sentence is ", maxlen_english)

maxlen_french = -1
for doc in df.french:
    tokens = nltk.word_tokenize(doc)
    if maxlen_french < len(tokens):
        maxlen_french = len(tokens)
        
print("Max number of words in any french sentence is ", maxlen_french)

Max number of words in any english sentence is  15
Max number of words in any french sentence is  24


In [19]:
# Create a tokenier to tokenize the words and create sequences of tokenized words
def tokenize_and_pad(x,maxlen):
    tokenizer = Tokenizer(char_level = False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    padded = pad_sequences(sequences, maxlen=maxlen, padding = 'post')
    return tokenizer, sequences, padded

In [20]:
x_tokenizer, x_sequences, x_padded = tokenize_and_pad(df.english, maxlen_english)
y_tokenizer, y_sequences, y_padded = tokenize_and_pad(df.french,  maxlen_french)

In [22]:
# Visualize example of a token
print("The tokenized version for english document\n", df.english[-1:].item(),"\n is : ", x_padded[-1:])

print("\n \n")

print("The tokenized version for french document\n", df.french[-1:].item(),"\n is : ", y_padded[-1:])

The tokenized version for english document
 the orange is her favorite fruit  but the banana is your favorite  
 is :  [[ 5 84  1 32 11 13  6  5 87  1 29 11  0  0  0]]

 

The tokenized version for french document
 l'orange est son fruit prÃ©fÃ©rÃ©  mais la banane est votre favori  
 is :  [[84  1 20 16 17  5  7 87  1 40 93  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [25]:
# Function to obtain text from padded
def pad_to_text(padded, tokenizer):
    id_to_word = {id: word for word, id in tokenizer.word_index.items()}
    id_to_word[0] = ''
    
    return ' '.join([id_to_word[j] for j in padded])

In [26]:
pad_to_text(y_padded[0], y_tokenizer)

"new jersey est parfois calme pendant l' automne et il est neigeux en avril          "

In [35]:
# Split data into train and test set
x_train, x_test, y_train, y_test = train_test_split(x_padded, y_padded, test_size=0.1)

In [30]:
# Total vocab size, since we added padding we add 1 to the total word count
english_vocab_size = total_english_words + 1
print("Complete English Vocab Size:", english_vocab_size)

french_vocab_size = total_french_words + 1
print("Complete French Vocab Size:", french_vocab_size)

Complete English Vocab Size: 200
Complete French Vocab Size: 351


In [33]:
#BUILDING THE MODEL
model = Sequential()

# Add an embedding layer
model.add(Embedding(english_vocab_size, 256, input_length = maxlen_english, mask_zero = True))

# Encoder
model.add(LSTM(256))

# decoder
# repeatvector repeats the input for the desired number of times to change 2D-array to 3D array
model.add(RepeatVector(maxlen_french))
model.add(LSTM(256, return_sequences = True))
model.add(TimeDistributed(Dense(french_vocab_size, activation = 'softmax')))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 256)           51200     
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 24, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 24, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 24, 351)           90207     
Total params: 1,192,031
Trainable params: 1,192,031
Non-trainable params: 0
_________________________________________________________________


In [36]:
# change the shape of target from 2D to 3D
y_train = np.expand_dims(y_train, axis = 2)
y_train.shape

(124074, 24, 1)

In [38]:
# Train the model
with tf.device('/gpu:0'):
    model.fit(x_train, y_train, batch_size=128, validation_split = 0.1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
# function to make prediction
def prediction(x, x_tokenizer = x_tokenizer, y_tokenizer = y_tokenizer):
    predictions = model.predict(x)[0]
    id_to_word = {id: word for word, id in y_tokenizer.word_index.items()}
    id_to_word[0] = ''
    return ' '.join([id_to_word[j] for j in np.argmax(predictions,1)])

In [40]:
# Let us take 5 examples
for i in range(5):

  print('Original English word - {}\n'.format(pad_to_text(x_test[i], x_tokenizer)))
  print('Original French word - {}\n'.format(pad_to_text(y_test[i], y_tokenizer)))
  print('Predicted French word - {}\n\n\n\n'.format(prediction(x_test[i:i+1])))

Original English word - paris is quiet during november and it is pleasant in march    

Original French word - paris est calme au mois de novembre et il est agrã©able en mars           

Predicted French word - paris est calme au mois de novembre et il en mars             




Original English word - he drives the rusty black truck         

Original French word - il conduit le camion noir rouillã©                  

Predicted French word - il conduit le camion noir rouillã©                  




Original English word - paris is never beautiful during march but it is wonderful in november   

Original French word - paris est jamais belle en mars mais il est merveilleux en novembre            

Predicted French word - paris est jamais belle en mars mais il est merveilleux en novembre            




Original English word - paris is sometimes snowy during october and it is never wonderful in summer  

Original French word - paris est parfois enneigã©e en octobre et il est jamais merveill

In [41]:
# Save the weights separately
model.save_weights('Final_weights.h5')

#Save the model (including weights)
model.save('model.h5')

In [43]:
# To load to model we saved elsewhere
model1 = tf.keras.models.load_model('model.h5')

In [53]:
# To predict using the model we saved
# function to make prediction
def prediction(x, x_tokenizer = x_tokenizer, y_tokenizer = y_tokenizer):
    predictions = model1.predict(x)[0]
    id_to_word = {id: word for word, id in y_tokenizer.word_index.items()}
    id_to_word[0] = ''
    return ' '.join([id_to_word[j] for j in np.argmax(predictions,1)])

In [54]:
print('Predicted French word - {}\n\n\n\n'.format(prediction(x_test[0:1])))

Predicted French word - paris est calme au mois de novembre et il en mars             




