<a href="https://colab.research.google.com/github/Tommytrungto/Research-Methods-for-Data-Science-with-Python/blob/master/TommyTo_MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install and Import Libraries**

In [None]:
!pip install --upgrade tensorflow-gpu

In [None]:
!pip install nltk

!pip install gensim

!pip install spacy

!pip install plotly

In [None]:
import nltk 

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from collections import Counter
import operator
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, TimeDistributed, RepeatVector, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model


In [None]:
# Install jupyterthemes and select the 'monokai' plotting theme through jtplot.style.
!pip install jupyterthemes
from jupyterthemes import jtplot
# context / ticks / grid are styling options forwarded to jtplot.style as written here.
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False) 

In [None]:
# Import data from Google Drive into one single-column frame per language.
# The separator is the tab escape '\t'. The previous '/t' was a two-character
# literal (slash + t) that only worked because it never matched anything and
# pushed pandas onto its regex/python parser. With a tab as the delimiter the
# commas inside the phrases stay part of the text.
# NOTE(review): assumes the phrase files contain no tab characters — confirm.
df_english = pd.read_csv('/content/drive/My Drive/PhrasesEnglish.csv', sep = '\t', names = ['english'])
df_french = pd.read_csv('/content/drive/My Drive/PhrasesFrench.csv', sep = '\t', names = ['french'])

In [None]:
# Sanity check of the loaded data: column summary of the French frame,
# then the first rows of the English frame (the cell's displayed value).
df_french.info()
df_english.head()

**Data Cleaning**

In [None]:
# Place the French and English phrases side by side (column-wise concatenation),
# so each row holds a French phrase and the English phrase at the same index.
df = pd.concat(objs=[df_french, df_english], axis='columns')
df

Unnamed: 0,french,english
0,new jersey est parfois calme pendant l' automn...,"new jersey is sometimes quiet during autumn , ..."
1,les états-unis est généralement froid en juill...,the united states is usually chilly during jul...
2,"california est généralement calme en mars , et...","california is usually quiet during march , and..."
3,"les états-unis est parfois légère en juin , et...",the united states is sometimes mild during jun...
4,"votre moins aimé fruit est le raisin , mais mo...","your least liked fruit is the grape , but my l..."
...,...,...
137855,"la france est jamais occupée en mars , et il e...","france is never busy during march , and it is ..."
137856,"l' inde est parfois belle au printemps , et il...","india is sometimes beautiful during spring , a..."
137857,"l' inde est jamais mouillé pendant l' été , ma...","india is never wet during summer , but it is s..."
137858,"la france est jamais froid en janvier , mais i...","france is never chilly during january , but it..."


In [None]:
# Report how many French phrases were loaded.
print(f'Total French phrases: {len(df_french)}')

Total French phrases: 137860


In [None]:
def remove_punctuations(x):
  """Return ``x`` with every ``! # ? , . : " ;`` character deleted.

  Only these characters are removed; whitespace around them is left untouched,
  so a removed ' , ' leaves two consecutive spaces behind.
  """
  punctuation_pattern = '[!#?,.:";]'
  cleaned = re.sub(punctuation_pattern, '', x)
  return cleaned

In [None]:
# Strip punctuation from both language columns (English first, then French),
# then preview the cleaned English column.
for column in ('english', 'french'):
  df[column] = df[column].apply(remove_punctuations)
df['english']

0         new jersey is sometimes quiet during autumn  a...
1         the united states is usually chilly during jul...
2         california is usually quiet during march  and ...
3         the united states is sometimes mild during jun...
4         your least liked fruit is the grape  but my le...
                                ...                        
137855    france is never busy during march  and it is s...
137856    india is sometimes beautiful during spring  an...
137857    india is never wet during summer  but it is so...
137858    france is never chilly during january  but it ...
137859    the orange is her favorite fruit  but the bana...
Name: english, Length: 137860, dtype: object

In [None]:
# Accumulators for the distinct words seen in each language (filled by later cells).
unique_english_words, unique_french_words = [], []

In [None]:
def get_unique_words(x, word_list):
  """Append each whitespace-separated word of ``x`` to ``word_list`` if it is not already there.

  ``word_list`` is modified in place, keeping first-seen order; nothing is returned.
  """
  for token in x.split():
    if token in word_list:
      continue
    word_list.append(token)


In [None]:
# Collect the distinct English words phrase by phrase, then show how many were found.
for english_phrase in df['english']:
  get_unique_words(english_phrase, unique_english_words)
len(unique_english_words)


199

In [None]:
# Collect the distinct French words phrase by phrase, then show how many were found.
for french_phrase in df['french']:
  get_unique_words(french_phrase, unique_french_words)
len(unique_french_words)

350

In [None]:
# Flatten every English phrase into one list of whitespace-separated words.
total_english_words = [word for phrase in df['english'] for word in phrase.split()]
# Counter.most_common() returns (word, count) pairs ordered from the most to
# the least frequent word -- the same list the manual
# sorted(..., key=itemgetter(1), reverse=True) call produced.
english_words_counts = Counter(total_english_words).most_common()
english_words_counts

In [None]:
# Flatten every French phrase into one list of whitespace-separated words.
total_french_words = [word for phrase in df['french'] for word in phrase.split()]
# Counter.most_common() returns (word, count) pairs ordered from the most to
# the least frequent word -- the same list the manual
# sorted(..., key=itemgetter(1), reverse=True) call produced.
french_words_counts = Counter(total_french_words).most_common()
#List of all the words and their counts
french_words_counts

**Data Visualization**

In [None]:
# Separate the (word, count) pairs into two parallel lists for plotting:
# position 0 of each pair is the word, position 1 is its count.
english_words = [pair[0] for pair in english_words_counts]
english_counts = [pair[1] for pair in english_words_counts]

english_counts

In [None]:
# Separate the French (word, count) pairs into two parallel lists:
# position 0 of each pair is the word, position 1 is its count.
french_words = [pair[0] for pair in french_words_counts]
french_counts = [pair[1] for pair in french_words_counts]

french_words

In [None]:
# Word cloud of the French phrases.
# Phrases are joined with a space so the last word of one phrase can never be
# glued to the first word of the next; joining with "" only worked as long as
# every phrase happened to end in whitespace.
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 200, width = 1600, height = 800).generate(" ".join(df.french))
plt.imshow(wc, interpolation='bessel')

In [None]:
#Interactive barplot of english words and their frequency
#px: plotly.express library (imported at the top of the notebook)
fig = px.bar(x = english_words, y = english_counts)
fig.show()

In [None]:
#Word cloud of the English phrases: all phrases are joined into one
#space-separated string and at most 200 words are drawn.
plt.figure(figsize=(20,20))
wc = WordCloud(max_words = 200, width = 1600, height = 800).generate(" ".join(df.english))
plt.imshow(wc, interpolation = 'bilinear')

In [None]:
# Example: the first English phrase and its NLTK word tokens.
# Only the last expression of a cell is displayed, so the bare phrase on the
# next line produces no output.
df.english[0]
nltk.word_tokenize(df.english[0])

In [None]:
#length of the longest English phrase, measured in NLTK word tokens (0 if there are no phrases)
maxlen_english = max((len(nltk.word_tokenize(phrase)) for phrase in df.english), default = 0)
print("The maximum number of words in any phrase = ", maxlen_english)

The maximum number of words in any phrase =  15


In [None]:
#length of the longest French phrase, measured in NLTK word tokens (-1 if there are no phrases)
maxlen_french = max((len(nltk.word_tokenize(phrase)) for phrase in df.french), default = -1)
maxlen_french

23

**Tokenization and Padding**

In [None]:
def tokenize_and_pad(df, maxlen):
  """Fit a word-level Keras ``Tokenizer`` on ``df`` and encode it as padded integer sequences.

  Args:
    df: the text phrases to encode (despite the name, the calls in this
      notebook pass a single DataFrame column).
    maxlen: target length handed to ``pad_sequences``.

  Returns:
    A ``(tokenizer, sequences, padded)`` tuple: the fitted tokenizer, the
    integer sequences before padding, and the result of ``pad_sequences``.
  """
  word_tokenizer = Tokenizer(char_level=False)
  #build the word -> integer index from the phrases
  word_tokenizer.fit_on_texts(df)
  #replace every word by its integer id
  encoded = word_tokenizer.texts_to_sequences(df)
  #padding='post' puts the zeros after the real tokens
  return word_tokenizer, encoded, pad_sequences(encoded, maxlen=maxlen, padding = 'post')

In [None]:
# Encode each language separately; every sequence is padded to the longest
# phrase length computed for that language in the cells above.
eng_tokenizer, eng_sequences, eng_padded = tokenize_and_pad(df.english, maxlen_english)
fr_tokenizer, fr_sequences, fr_padded = tokenize_and_pad(df.french, maxlen_french)

fr_padded

In [None]:
# Show the last phrase of each language next to its padded integer encoding.
# [-1:] keeps a length-1 slice; .item() extracts the single phrase from it.
print("The tokenized version for the last phrase of french df is:\n", df.french[-1:].item()," \n", fr_padded[-1:])
print("\n")
print("The tokenized version for the last phrase of english df is:\n", df.english[-1:].item()," \n", eng_padded[-1:])



In [None]:
#Split train and test data (20% of the phrase pairs are held out for testing)
from sklearn.model_selection import train_test_split
#A fixed random_state makes the shuffle -- and therefore the reported metrics --
#reproducible across kernel restarts.
eng_train, eng_test, fr_train, fr_test = train_test_split(eng_padded, fr_padded, test_size = 0.2, random_state = 42)

Gradient descent is an optimization technique that iteratively minimizes the cost function. Long Short-Term Memory (LSTM) networks are used to overcome the vanishing-gradient problem.
An LSTM uses a horizontal memory line (the cell state) to remember and recall information over a prolonged period of time.

Encoder-Decoder Model
French -> Embedding Layer -> LSTM (Encoder) -> (RepeatVector -> LSTM) Decoder -> TimeDistributed(Dense) ->English

In [None]:
# Vocabulary sizes are taken from the fitted tokenizers, because the model is
# fed (and predicts) the integer ids those tokenizers produce. +1 accounts for
# id 0, which pad_sequences uses as padding and which is not in word_index.
# Counting whitespace-split words instead can disagree with the tokenizer's own
# filtering/splitting rules and would then mis-size the Embedding and softmax layers.
eng_vocab_size = len(eng_tokenizer.word_index) + 1

fr_vocab_size = len(fr_tokenizer.word_index) + 1


In [None]:
#Encoder-decoder network built as one Sequential stack
model = Sequential([
  #Embedding layer: French token ids -> 256-dimensional vectors, id 0 masked
  Embedding(fr_vocab_size, 256, input_length = maxlen_french, mask_zero = True),
  #Encoder: returns only its final output
  LSTM(256),
  #Decoder: repeat the encoder output once per English time step, then decode a sequence
  RepeatVector(maxlen_english),
  LSTM(256, return_sequences = True),
  #Dense layer applied at every time step: softmax over the English vocabulary
  TimeDistributed(Dense(eng_vocab_size, activation = 'softmax')),
])
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
#Give the English targets a trailing axis: (samples, timesteps) -> (samples, timesteps, 1)
#Reshaping from the first two dimensions (instead of np.expand_dims followed by
#a no-op reshape) keeps this cell idempotent: running it again leaves the shape
#unchanged instead of stacking another axis onto eng_train.
eng_train = eng_train.reshape(eng_train.shape[0], eng_train.shape[1], 1)
eng_train.shape

(117181, 15, 1)

In [None]:
# Number of training samples.
len(eng_train)
# NOTE(review): the pairs below look like notes from earlier runs (a setting
# such as layer/embedding size followed by a score) — confirm what they measure.
#512-0.6312
#256-0.6598
#128-0.6493

117181

In [None]:
#Train the model: French sequences are the inputs, English sequences the targets.
#10% of the training data is held out for validation (validation_split = 0.1).
model.fit(fr_train, eng_train, batch_size = 1024, validation_split = 0.1, epochs = 17)



<tensorflow.python.keras.callbacks.History at 0x7fe633431b70>

**Assess trained model performance**

In [None]:
#generate the model's English predictions for the French test sequences
#(raw model output; generatePrediction below takes the argmax over the last axis to get word ids)
eng_predict = model.predict(fr_test)

eng_predict

In [None]:
def generatePrediction(phrase, eng_tokenizer = eng_tokenizer, fr_tokenizer = fr_tokenizer):
  """Run the global ``model`` on ``phrase`` and return the predicted sentence as text.

  Only the first prediction of the batch is decoded. At every time step the
  most probable id is looked up in the inverse of ``eng_tokenizer.word_index``;
  id 0 maps to an empty string. ``fr_tokenizer`` is accepted but not used.
  """
  step_probabilities = model.predict(phrase)[0]
  index_to_word = dict((index, word) for word, index in eng_tokenizer.word_index.items())
  index_to_word[0] = ''
  best_ids = np.argmax(step_probabilities, 1)
  return ' '.join(index_to_word[best] for best in best_ids)

In [None]:
def pad_to_text(padded, tokenizer):
  """Decode a sequence of token ids back into a space-separated string.

  Ids are looked up in the inverse of ``tokenizer.word_index``; id 0 becomes an
  empty string, so zero entries show up as extra spaces in the result.
  """
  index_to_word = dict((index, word) for word, index in tokenizer.word_index.items())
  index_to_word[0] = ''
  decoded = []
  for token_id in padded:
    decoded.append(index_to_word[token_id])
  return ' '.join(decoded)

In [None]:
# Compare the first nine test translations against their references.
# The model maps French -> English, so the prediction has to be generated from
# the French test sequence (fr_test); feeding the English reference (eng_test)
# produced output unrelated to the source sentence.
for i in range (9):
  print('Original French sentence: {}\n'.format(pad_to_text(fr_test[i], fr_tokenizer)))
  print('Original English sentence: {}\n'.format(pad_to_text(eng_test[i], eng_tokenizer)))
  # i:i+1 keeps the batch dimension that model.predict expects.
  print('Predicted English sentence: {}\n\n\n\n'.format(generatePrediction(fr_test[i:i+1])))

Original French sentence: elle déteste les mangues les citrons verts et les pommes             

Original English sentence: she dislikes mangoes limes and apples         

Predicted English sentence: i is sometimes beautiful march and         




Original French sentence: paris est jamais merveilleux en juin mais il est relaxant à l' automne          

Original English sentence: paris is never wonderful during june but it is relaxing in autumn   

Predicted English sentence: india is relaxing during may and it is rainy in summer    




Original French sentence: l' inde est parfois occupée en octobre et il est calme à l' automne         

Original English sentence: india is sometimes busy during october and it is quiet in fall   

Predicted English sentence: france is sometimes quiet during october but it is quiet in summer   




Original French sentence: la chine est relaxant parfois pendant l' hiver mais il est sec en août         

Original English sentence: china is sometimes rel