# IMPORT LIBRARIES AND DATASETS

In [None]:
from collections import Counter
import operator
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, TimeDistributed, RepeatVector, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False) 
# setting the style of the notebook to be monokai theme  
# this line of code is important to ensure that we are able to see the x and y axes clearly
# If you don't run this code line, you will notice that the xlabel and ylabel on any plot is black on black and it will be hard to see them. 


In [None]:
# load the data
# The separator must be the tab escape '\t'. The previous '/t' was a typo: a
# multi-character separator is interpreted by pandas as a regular expression
# and forces a fallback to the slower python parsing engine.
# NOTE(review): presumably each file holds one sentence per line with no tabs,
# so every line lands in the single named column - confirm against the data.
df_english = pd.read_csv('small_vocab_en.csv', sep = '\t', names = ['english'])
df_french = pd.read_csv('small_vocab_fr.csv', sep = '\t', names = ['french'])

In [None]:
df_english

In [None]:
df_french

In [None]:
df = pd.concat([df_english, df_french], axis = 1)

In [None]:
df

# PERFORM DATA CLEANING

In [None]:
# download nltk packages
nltk.download('punkt')

# download stopwords
nltk.download("stopwords")

In [None]:
# function to strip punctuation characters out of a string
def remove_punc(x):
    """Return ``x`` with every ``! # ? , . : " ;`` character deleted."""
    punctuation = re.compile('[!#?,.:";]')
    return punctuation.sub('', x)

In [None]:
df['french'] = df['french'].apply(remove_punc)
df['english'] = df['english'].apply(remove_punc)

In [None]:
english_words = []
french_words  = []

In [None]:
def get_unique_words(x, word_list):
    """Append each whitespace-separated word of ``x`` that ``word_list`` lacks.

    ``word_list`` is modified in place, words keep their first-seen order,
    and nothing is returned.
    """
    for token in x.split():
        if token in word_list:
            # already recorded, keep only the first occurrence
            continue
        word_list.append(token)

df["english"].apply(lambda x: get_unique_words(x, english_words));
df["french"].apply(lambda x: get_unique_words(x, french_words));

In [None]:
# number of unique words in english and in french
total_english_words = len(english_words)
total_french_words = len(french_words)
print(total_english_words)
print(total_french_words)

# VISUALIZE CLEANED UP DATASET

In [None]:
# Flatten all English sentences into a single list of words
words = [word for sentence in df['english'] for word in sentence.split()]

words

In [None]:
# Obtain the total count of words
english_words_counts = Counter(words)
english_words_counts

In [None]:
# turn the counter into (word, count) pairs ordered from most to least frequent
english_words_counts = english_words_counts.most_common()

In [None]:
english_words_counts

In [None]:
english_words, english_counts = zip(*english_words_counts)
english_words = list(english_words)
english_counts = list(english_counts)

In [None]:
# # append the values to a list for visualization purposes
# english_words = []
# english_counts = []
# for i in range(len(english_words_counts)):
#     english_words.append(english_words_counts[i][0])
#     english_counts.append(english_words_counts[i][1])

In [None]:
english_words

In [None]:
english_counts

In [None]:
# Plot barplot using plotly 
fig = px.bar(x = english_words, y = english_counts)
fig.show()

In [None]:
# plot the word cloud for the English text
plt.figure(figsize = (20,20)) 
wc = WordCloud(max_words = 2000, width = 1600, height = 800 ).generate(" ".join(df.english))
plt.imshow(wc, interpolation = 'bilinear')

In [None]:
df.english[0]
nltk.word_tokenize(df.english[0])

In [None]:
# Length, in nltk tokens, of the longest English document (-1 if there are none).
# It is used later as the padding length and the embedding input length.
maxlen_english = max((len(nltk.word_tokenize(doc)) for doc in df.english), default = -1)
print("The maximum number of words in any document = ", maxlen_english)

In [None]:
# Flatten all French sentences into a single list of words
words = [word for sentence in df['french'] for word in sentence.split()]

words

In [None]:
# Obtain the total count of words
french_words_counts = Counter(words)
french_words_counts

In [None]:
# turn the counter into (word, count) pairs ordered from most to least frequent
french_words_counts = french_words_counts.most_common()

In [None]:
french_words_counts

In [None]:
french_words, french_counts = zip(*french_words_counts)
french_words = list(french_words)
french_counts = list(french_counts)

In [None]:
french_words

In [None]:
french_counts

In [None]:
# Plot barplot using plotly 
fig = px.bar(x = french_words, y = french_counts)
fig.show()

In [None]:
# plot the word cloud for the French text
plt.figure(figsize = (20,20)) 
wc = WordCloud(max_words = 2000, width = 1600, height = 800 ).generate(" ".join(df.french))
plt.imshow(wc, interpolation = 'bilinear')

In [None]:
df.french[0]
nltk.word_tokenize(df.french[0])

In [None]:
# Length, in nltk tokens, of the longest French document (-1 if there are none).
# It is used later as the padding length.
maxlen_french = max((len(nltk.word_tokenize(doc)) for doc in df.french), default = -1)
print("The maximum number of words in any document = ", maxlen_french)

# PREPARE THE DATA BY PERFORMING TOKENIZATION AND PADDING

In [None]:
def tokenize_and_pad(x, maxlen):
    """Fit a word-level tokenizer on ``x`` and return padded id sequences.

    Returns a tuple ``(tokenizer, sequences, padded)``: the fitted Tokenizer,
    the integer sequences it produced for ``x``, and those sequences passed
    through ``pad_sequences`` with ``maxlen`` and padding added at the end.
    """
    # word-level (not character-level) tokenizer, fitted on the given texts
    fitted = Tokenizer(char_level = False)
    fitted.fit_on_texts(x)

    # map every text to its list of integer word ids
    id_sequences = fitted.texts_to_sequences(x)

    return fitted, id_sequences, pad_sequences(id_sequences, maxlen = maxlen, padding = 'post')

In [None]:
# tokenize and padding to the data 
maxlen = max(maxlen_english, maxlen_french)
x_tokenizer, x_sequences, x_padded = tokenize_and_pad(df.english, maxlen)
y_tokenizer, y_sequences, y_padded = tokenize_and_pad(df.french,  maxlen)

In [None]:
# Total vocab size, taken from the fitted tokenizer so that it matches the ids
# that are actually fed to the model. The earlier whitespace-split word count is
# case-sensitive and does not apply the tokenizer's own normalisation, so the
# two can disagree. We add 1 because id 0 is used for padding.
english_vocab_size = len(x_tokenizer.word_index) + 1
print("Complete English Vocab Size:", english_vocab_size)

In [None]:
# Total vocab size, taken from the fitted tokenizer so that it matches the ids
# used as training targets. The earlier whitespace-split word count is
# case-sensitive and does not apply the tokenizer's own normalisation, so the
# two can disagree. We add 1 because id 0 is used for padding.
french_vocab_size = len(y_tokenizer.word_index) + 1
print("Complete French Vocab Size:", french_vocab_size)

In [None]:
print("The tokenized version for document\n", df.english[-1:].item(),"\n is : ", x_padded[-1:])

In [None]:
print("The tokenized version for document\n", df.french[-1:].item(),"\n is : ", y_padded[-1:])

In [None]:
# function to turn a padded id sequence back into text
def pad_to_text(padded, tokenizer):
    """Decode the ids in ``padded`` into a space-joined string.

    The mapping is the inverse of ``tokenizer.word_index``; id 0 decodes to
    an empty string, so each padding position still contributes a separator.
    """
    vocab = tokenizer.word_index
    id_to_word = dict(zip(vocab.values(), vocab.keys()))
    id_to_word[0] = ''

    decoded = [id_to_word[token_id] for token_id in padded]
    return ' '.join(decoded)

In [None]:
pad_to_text(y_padded[0], y_tokenizer)

In [None]:
# Train test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_padded, y_padded, test_size = 0.1)

# BUILD AND TRAIN THE MODEL 

In [None]:
# Sequential Model, declared as one ordered list of layers
model = Sequential([
    # embedding layer: one 256-dimensional vector per English token id
    Embedding(english_vocab_size, 256, input_length = maxlen, mask_zero = True),
    # encoder
    LSTM(256),
    # decoder
    # RepeatVector repeats the encoder output maxlen times, which changes the
    # 2D array into a 3D array. For example: (1,256) to (1,23,256)
    RepeatVector(maxlen),
    LSTM(256, return_sequences = True),
    # softmax over the French vocabulary at every output position
    TimeDistributed(Dense(french_vocab_size, activation = 'softmax')),
])
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
# change the shape of target from 2D to 3D
y_train = np.expand_dims(y_train, axis = 2)
y_train.shape

In [None]:
# train the model
model.fit(x_train, y_train, batch_size=1024, validation_split= 0.1, epochs=10)

In [None]:
# save the model
model.save("weights.h5")

In [None]:
import tensorflow as tf
from plot_model import plot_model
plot_model(
    model, 
    to_file='model.png', 
    show_shapes=True, 
    show_layer_names=False, 
    rankdir='TB', 
    expand_nested=False, 
    style=0, 
    color=True, 
    dpi=96
)

# ASSESS TRAINED MODEL PERFORMANCE


In [None]:
# function to make prediction
def prediction(x, x_tokenizer = x_tokenizer, y_tokenizer = y_tokenizer):
    """Translate the first sample of ``x`` with the trained model.

    ``x`` is passed to ``model.predict``; only the first result is decoded.
    At each output position the highest-scoring id is picked and the ids are
    turned back into text with ``pad_to_text`` and ``y_tokenizer``.

    ``x_tokenizer`` is not used by the body; it is kept so that existing
    callers passing it keep working.
    """
    # NOTE(review): assumes x is a batch of padded id sequences, as in the
    # x_test[i:i+1] call in this notebook - confirm for other callers.
    predictions = model.predict(x)[0]
    # reuse the shared decoder instead of rebuilding the id -> word table here
    return pad_to_text(np.argmax(predictions, 1), y_tokenizer)

In [None]:
# print the source, the reference and the predicted text for the first five test samples
for sample in range(5):
    source_text = pad_to_text(x_test[sample], x_tokenizer)
    print('Original English word - {}\n'.format(source_text))

    reference_text = pad_to_text(y_test[sample], y_tokenizer)
    print('Original French word - {}\n'.format(reference_text))

    # slice rather than index so that the model still receives a batch of one
    predicted_text = prediction(x_test[sample:sample + 1])
    print('Predicted French word - {}\n\n\n\n'.format(predicted_text))