In [0]:
import collections
import tensorflow as tf
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.models import load_model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [0]:
import os
def load_data(path):
    """
    Load a dataset file and return its lines.

    :param path: Path of the text file to read.
    :return: List of strings obtained by splitting the file content on '\n'.
        A file ending with a newline therefore yields a trailing empty
        string, and an empty file yields [''].
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (the French corpus presumably contains accented characters).
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    return data.split('\n')

In [0]:
# Directory holding both corpus files; change this single constant to relocate the data.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/data'

# Loading English data
english_sentences = load_data(os.path.join(DATA_DIR, 'small_vocab_en'))
# Loading French data
french_sentences = load_data(os.path.join(DATA_DIR, 'small_vocab_fr'))

# The two files are used as inputs/targets of the same model, so they must have
# the same number of lines; fail here rather than deep inside training.
assert len(english_sentences) == len(french_sentences), \
    "English and French corpora have different numbers of lines"

In [0]:
import pickle

def tokenize(x, save_path='tokenizer.pickle'):
    """
    Fit a new Tokenizer on `x` and convert `x` to sequences of word indices.

    :param x: List of texts to fit on and to convert.
    :param save_path: File the fitted tokenizer is pickled to. Defaults to
        'tokenizer.pickle' (the file `toke` reads). Pass None to skip saving,
        e.g. for throw-away calls that must not overwrite a saved tokenizer.
    :return: Tuple of (sequences for x, fitted tokenizer).
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    # Saving the fitted tokenizer so it can be reloaded later without refitting.
    # Note that an existing file at save_path is overwritten.
    if save_path is not None:
        with open(save_path, 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return tokenizer.texts_to_sequences(x), tokenizer
# Run `tokenize` once on a single empty string (appears to be a quick check).
# NOTE(review): `tokenize` pickles its tokenizer to 'tokenizer.pickle' on every
# call, so this also writes a tokenizer fitted only on [''] to that file. It is
# replaced when `tokenize` runs again on real data; re-running this cell after
# that point would overwrite the saved tokenizer.
ts = ['']
ts_tokenized, ts_tokenizer = tokenize(ts)

In [0]:
def toke(x):
    """
    Convert texts to index sequences using the tokenizer saved by `tokenize`.

    The tokenizer is loaded from 'tokenizer.pickle' and used as-is, so the
    word -> index mapping is exactly the one that was saved.

    :param x: List of texts to convert.
    :return: Tuple of (sequences for x, loaded tokenizer).
    """
    # loading the saved tokenizer
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # 'tokenizer.pickle' that this notebook wrote itself.
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    # Deliberately no fit_on_texts(x) here: refitting on new input updates the
    # word counts and rebuilds word_index, which can assign indices to unseen
    # words (beyond the vocabulary size the model was built with) and reorder
    # existing ones, so the sequences would no longer match the trained model.
    return tokenizer.texts_to_sequences(x), tokenizer

In [0]:
def tokenize_y(x):
    """
    Fit a fresh Tokenizer on `x` and encode `x` with it (nothing is saved to disk).

    :param x: List of texts to fit on and to convert.
    :return: Tuple of (sequences for x, fitted tokenizer).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted

In [0]:
def pad(x, length=None):
    """
    Pad sequences at the end ('post') via Keras' pad_sequences.

    :param x: List of sequences.
    :param length: Forwarded as `maxlen`. With None, pad_sequences chooses the
        target length itself (per its documentation, the longest sequence in x).
    :return: Whatever pad_sequences returns for x.
    """
    padded = pad_sequences(x, maxlen=length, padding='post')
    return padded

In [0]:
def preprocess(x, y):
    """
    Tokenize and pad the input texts `x` and the target texts `y`.

    `x` goes through `tokenize` and `y` through `tokenize_y`; both results are
    padded with `pad`, and the padded targets get one extra trailing axis of
    size 1.

    :param x: List of input texts.
    :param y: List of target texts.
    :return: Tuple of (padded x, padded y with trailing axis, x tokenizer, y tokenizer).
    """
    x_sequences, x_tk = tokenize(x)
    y_sequences, y_tk = tokenize_y(y)

    padded_x = pad(x_sequences)
    padded_y = pad(y_sequences)

    # Append a length-1 axis to the targets (presumably the label layout the
    # sparse categorical cross-entropy loss used in this file expects).
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

# Tokenize and pad both corpora, keeping the fitted tokenizers for later use.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

In [0]:
def logits_to_text(logits, tokenizer):
    """
    Turn per-position scores into a space-separated string of words.

    :param logits: 2-D array; the argmax is taken along axis 1, giving one word
        index per row.
    :param tokenizer: Object whose `word_index` maps word -> index.
    :return: The words for each row's best index joined by single spaces;
        index 0 is rendered as '<PAD>'.
    """
    index_to_words = {}
    for word, index in tokenizer.word_index.items():
        index_to_words[index] = word
    index_to_words[0] = '<PAD>'

    best_indices = np.argmax(logits, 1)
    words = []
    for best in best_indices:
        words.append(index_to_words[best])
    return ' '.join(words)

In [149]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile an Embedding -> GRU -> Dense -> softmax sequence model.

    :param input_shape: Shape tuple of the input data; index 1 is used as the
        input sequence length and `input_shape[1:]` as the per-sample shape.
    :param output_sequence_length: Accepted for interface compatibility; not
        used by this function.
    :param english_vocab_size: Size of the Embedding layer's input vocabulary.
    :param french_vocab_size: Number of units of the final softmax layer.
    :return: The compiled Sequential model.
    """
    # Hyperparameters
    learning_rate = 0.005

    # Building the layers, added to the model in the order listed
    model = Sequential()
    stack = (
        Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]),
        GRU(256, return_sequences=True),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    )
    for layer in stack:
        model.add(layer)

    # Compiling the model
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

# Pad the English sequences to the French sequence length, then make sure the
# input is 2-D with that length as its second dimension.
tmp_x = pad(preproc_english_sentences, length=preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape(-1, preproc_french_sentences.shape[-2])

# Build the network; each vocabulary size is the tokenizer's word count plus
# one, which leaves room for index 0.
embed_rnn_model = embed_model(
    input_shape=tmp_x.shape,
    output_sequence_length=preproc_french_sentences.shape[1],
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    french_vocab_size=len(french_tokenizer.word_index) + 1,
)

embed_rnn_model.summary()

# Train, holding out the last 20% of the samples for validation.
embed_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=10,
    validation_split=0.2,
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 21, 256)           51200     
_________________________________________________________________
gru_5 (GRU)                  (None, 21, 256)           393984    
_________________________________________________________________
time_distributed_9 (TimeDist (None, 21, 1024)          263168    
_________________________________________________________________
dropout_5 (Dropout)          (None, 21, 1024)          0         
_________________________________________________________________
time_distributed_10 (TimeDis (None, 21, 345)           353625    
Total params: 1,061,977
Trainable params: 1,061,977
Non-trainable params: 0
_________________________________________________________________
Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoc

<keras.callbacks.History at 0x7f18644c3f98>

In [0]:
# Saving the model to an HDF5 file
embed_rnn_model.save('model.h5')
# and reloading it with the same Keras package that built and saved it.
# The model is created from standalone `keras` classes, so it is loaded with
# `keras.models.load_model` rather than `tf.keras.models.load_model`, which
# avoids mixing the two implementations.
reloaded_model = load_model('model.h5')

In [0]:
def prep(x):
    """
    Encode texts with the saved tokenizer (via `toke`) and pad the result.

    :param x: List of texts to convert.
    :return: Tuple of (padded sequences, tokenizer returned by `toke`).
    """
    sequences, x_tk = toke(x)
    return pad(sequences), x_tk

In [152]:
sente = ['how are you']
print('In english: ', sente)

# Bind the tokenizer returned by prep() to its own name instead of rebinding
# `english_tokenizer`, so cells that read `english_tokenizer` (e.g. the
# vocabulary-size computation used to build the model) keep seeing the
# tokenizer produced by preprocess().
preproc_sente, sente_tokenizer = prep(sente)

# Pad to the sequence length the model was trained with.
tx = pad(preproc_sente, preproc_french_sentences.shape[1])
tx = tx.reshape((-1, preproc_french_sentences.shape[-2]))

# Predict for the first (only) sentence; [0] drops the batch axis.
logi = reloaded_model.predict(tx[:1])[0]
print("Prediction to French:")
print(logits_to_text(logi, french_tokenizer))

In english:  ['how are you']
Prediction to French:
comment sont vous <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
