In [3]:
%load_ext autoreload
%aimport helper, tests
%autoreload 1
%run project_tests.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import collections

import helper
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from tensorflow.keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from sklearn.model_selection import train_test_split

In [5]:
def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')

In [6]:
#Load English data
english_sentences = load_data('small_vocab_en.txt')
#Load French data
french_sentences = load_data('small_vocab_fr.txt')

In [7]:
#Check the first 5 lines from each file
for i in range(5):
    print('English sample {}:  {}'.format(i + 1, english_sentences[i]))
    print('French sample {}:  {}\n'.format(i + 1, french_sentences[i]))

English sample 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
French sample 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

English sample 2:  the united states is usually chilly during july , and it is usually freezing in november .
French sample 2:  les Ã©tats-unis est gÃ©nÃ©ralement froid en juillet , et il gÃ¨le habituellement en novembre .

English sample 3:  california is usually quiet during march , and it is usually hot in june .
French sample 3:  california est gÃ©nÃ©ralement calme en mars , et il est gÃ©nÃ©ralement chaud en juin .

English sample 4:  the united states is sometimes mild during june , and it is cold in september .
French sample 4:  les Ã©tats-unis est parfois lÃ©gÃ¨re en juin , et il fait froid en septembre .

English sample 5:  your least liked fruit is the grape , but my least liked is the apple .
French sample 5:  votre moins aimÃ© fruit est le raisin , mais mon moins aimÃ© est la pomme .



In [8]:
#Check the complexity of the dataset by checking the words
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} Unique English words'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 Unique English words
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [9]:
#Function for tokenizing
def tokenize(sentences):
    """
    Tokenizes sentences
    param sentences: List of sentences/string to be tokenized
    return: Tuple of tokenized x data and the tokenizer used to tokenize x
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    return tokenizer.texts_to_sequences(sentences),tokenizer

In [10]:
#Test the tokenize function
test_tokenize(tokenize)

#Tokenize an example
text_sentences = ['The quick brown fox jumps over the lazy dog .','By Jove , my quick study of lexicography won a prize .','This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print('\n')
for i,(input, output) in enumerate(zip(text_sentences, text_tokenized)):
    print(f'Sequence {i + 1} in x')
    print(f'   Input:  {input}')
    print(f'   Output:  {output}')
    


{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}


Sequence 1 in x
   Input:  The quick brown fox jumps over the lazy dog .
   Output:  [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
   Input:  By Jove , my quick study of lexicography won a prize .
   Output:  [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
   Input:  This is a short sentence .
   Output:  [18, 19, 3, 20, 21]


In [11]:
#Pad each sequence to be the same length

#Function for padding
def pad(list_to_pad,length=None):
    """
    Pad list_to_pad
    param list_to_pad: List of sequences.
    param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    return: Padded numpy array of sequences
    """
    answer = pad_sequences(list_to_pad,maxlen=length,padding='post')
    return answer

#Test the pad function
#test_pad(pad)

In [12]:
#Pad the tokenized output
test_run = pad(text_tokenized)
for i,(token_sent,pad_sent) in enumerate(zip(text_tokenized,test_run)):
    print(f'Sequence {i + 1} in x')
    print(f'Input: {np.array(token_sent)}')
    print(f'Output: {pad_sent}') 

Sequence 1 in x
Input: [1 2 4 5 6 7 1 8 9]
Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
Input: [10 11 12  2 13 14 15 16  3 17]
Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
Input: [18 19  3 20 21]
Output: [18 19  3 20 21  0  0  0  0  0]


In [13]:
#Function for creating a preprocess pipeline
def preprocess(x,y):
    """
    Preprocess x and y
    x: Feature list of sentences
    y: Label list of sentences
    return a tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

In [14]:
#Preprocess the english and test_sentences
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


In [15]:
#Create a function  to map word ids back to text
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [253]:
def simple_model(input_shape,output_sequence_length,english_vocab_size,french_vocab_size):
    """
    Build and train a basic RNN on x ans y
    param input_shape: Tuple of the input shape
    param output_sequence_length: Length of output sequence
    param english_vocab_size: Numbe of unique english words in the dataset
    param french_vocab_size: Number of  unique french words in the dataset
    return: Keras model built
    """
    #Build the layers
    learning_rate = 0.005
    model = Sequential()
    model.add(GRU(256,input_shape=input_shape[1:],return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax'))) 

    #Compile the model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [254]:
#Test the simple_model function
test_simple_model(simple_model)

In [255]:
# Reshaping the input to work with a basic RNN
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

In [256]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

In [257]:
#Train your neural network
simple_rnn_model = simple_model(tmp_x.shape,max_french_sequence_length,english_vocab_size,french_vocab_size)

print(simple_rnn_model.summary())

simple_rnn_model.fit(X_train[:80000],y_train[:80000],batch_size=1024,epochs=10,validation_split=0.2)

None
Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 959ms/step - accuracy: 0.4203 - loss: 2.9953 - val_accuracy: 0.6008 - val_loss: 1.5159
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 922ms/step - accuracy: 0.5977 - loss: 1.4886 - val_accuracy: 0.6406 - val_loss: 1.2421
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 927ms/step - accuracy: 0.6333 - loss: 1.2676 - val_accuracy: 0.6684 - val_loss: 1.1255
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 933ms/step - accuracy: 0.6550 - loss: 1.1631 - val_accuracy: 0.6743 - val_loss: 1.0651
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 931ms/step - accuracy: 0.6665 - loss: 1.0982 - val_accuracy: 0.6910 - val_loss: 0.9864
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 930ms/step - accuracy: 0.6783 - loss: 1.0324 - val_accuracy: 0.6984 - val_loss: 0.9415
Epoch 7/10
[1m63

<keras.src.callbacks.history.History at 0x1a0598fca10>

In [258]:
# Print your predictions
print("Prediction:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print("\nOriginal text:")
print(english_sentences[:1])

Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 868ms/step
new jersey est parfois calme en mois de mai et il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [265]:
#Let's try a 2nd model using word embedding with the RNNs
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train an RNN model using word embedding on x ans y
    param input_shape: Tuple of the input shape
    param output_sequence_length: Length of output sequence
    param english_vocab_size: Numbe of unique english words in the dataset
    param french_vocab_size: Number of  unique french words in the dataset
    return: Keras model built
    """
    #Build the layers
    learning_rate = 0.005
    model = Sequential()
    model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(GRU(256,return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax'))) 

    #Compile the model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [266]:
#Test the embed_model function
test_embed_model(simple_model)

  super().__init__(**kwargs)


In [267]:
#Reshape the input
tmp_x = pad(preproc_english_sentences,preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

In [268]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

In [269]:
# TODO: Train the neural network
embed_rnn_model = embed_model(tmp_x.shape,preproc_french_sentences.shape[1],len(english_tokenizer.word_index)+1,len(french_tokenizer.word_index)+1)

  super().__init__(**kwargs)


In [270]:
embed_rnn_model.summary()

In [272]:
embed_rnn_model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 1s/step - accuracy: 0.5561 - loss: 2.2544 - val_accuracy: 0.8272 - val_loss: 0.5272
Epoch 2/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 1s/step - accuracy: 0.8444 - loss: 0.4816 - val_accuracy: 0.8841 - val_loss: 0.3453
Epoch 3/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 1s/step - accuracy: 0.8904 - loss: 0.3308 - val_accuracy: 0.9117 - val_loss: 0.2664
Epoch 4/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 1s/step - accuracy: 0.9093 - loss: 0.2715 - val_accuracy: 0.9205 - val_loss: 0.2337
Epoch 5/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 1s/step - accuracy: 0.9192 - loss: 0.2392 - val_accuracy: 0.9205 - val_loss: 0.2286
Epoch 6/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 1s/step - accuracy: 0.9237 - loss: 0.2235 - val_accuracy: 0.9291 - val_loss: 0.2063
Epoch 7/10
[1m87/87[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1a06784fb50>

In [273]:
# Print prediction(s)
print("Prediction:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print("\nOriginal text:")
print(english_sentences[:1])


Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 796ms/step
new jersey est parfois calme en l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [274]:
#Let's try a third model. For RNNs, they cannot see the future input. We can use bidirectional recurrent neural netowrks to see the future and past data.
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a bidirectional RNN model on x and y
    param input_shape: Tuple of input shape
    param output_sequence_length: Length of output sequence
    param english_vocab_size: Number of unique English words in the dataset
    param french_vocab_size: Number of unique French words in the dataset
    return: Keras model built, but not trained
    """
    learning_rate = 0.003
    
    #Build the layers
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True), input_shape=input_shape[1:]))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [275]:
test_bd_model(bd_model)

  super().__init__(**kwargs)


In [276]:
# TODO: Reshape the input
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

In [277]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

In [278]:
# TODO: Train and Print prediction(s)
embed_rnn_bi_model = embed_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)

embed_rnn_bi_model.summary()

  super().__init__(**kwargs)


In [279]:
embed_rnn_bi_model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 1s/step - accuracy: 0.5924 - loss: 2.0202 - val_accuracy: 0.8440 - val_loss: 0.4761
Epoch 2/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5956s[0m 69s/step - accuracy: 0.8566 - loss: 0.4356 - val_accuracy: 0.8993 - val_loss: 0.3011
Epoch 3/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 2s/step - accuracy: 0.9000 - loss: 0.3006 - val_accuracy: 0.9141 - val_loss: 0.2463
Epoch 4/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 1s/step - accuracy: 0.9136 - loss: 0.2539 - val_accuracy: 0.9235 - val_loss: 0.2254
Epoch 5/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 973ms/step - accuracy: 0.9228 - loss: 0.2256 - val_accuracy: 0.9276 - val_loss: 0.2084
Epoch 6/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 976ms/step - accuracy: 0.9277 - loss: 0.2089 - val_accuracy: 0.9309 - val_loss: 0.2005
Epoch 7/10
[1m87/87[0m [32m

<keras.src.callbacks.history.History at 0x19eb9621c10>

In [280]:
# Print prediction(s)
print("Prediction:")
print(logits_to_text(embed_rnn_bi_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print("\nOriginal text:")
print(english_sentences[:1])

Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
new jersey est parfois calme en l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [281]:
#For the 4th model, let's try an encoder-decoder model. The encoder creates a matrix representation kf the sentence and the decoder takes this matrix as input and predicts the translation as output
def encdec_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train an encoder-decoder model on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    learning_rate = 0.001
    
    # Build the layers    
    model = Sequential()
    # Encoder
    model.add(GRU(256, input_shape=input_shape[1:], go_backwards=True))
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(GRU(256, return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

In [282]:
test_encdec_model(encdec_model)

  super().__init__(**kwargs)


In [283]:
# Reshape the input
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

In [284]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2, random_state=42)

In [285]:
# Create the model and display its summary
encdec_rnn_model = encdec_model(tmp_x.shape,preproc_french_sentences.shape[1],len(english_tokenizer.word_index)+1,len(french_tokenizer.word_index)+1)
encdec_rnn_model.summary()

In [286]:
encdec_rnn_model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 1s/step - accuracy: 0.4039 - loss: 3.2776 - val_accuracy: 0.5364 - val_loss: 1.9052
Epoch 2/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 1s/step - accuracy: 0.5432 - loss: 1.8551 - val_accuracy: 0.5749 - val_loss: 1.5639
Epoch 3/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 1s/step - accuracy: 0.5866 - loss: 1.5576 - val_accuracy: 0.6182 - val_loss: 1.3848
Epoch 4/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 1s/step - accuracy: 0.6139 - loss: 1.4084 - val_accuracy: 0.6400 - val_loss: 1.3179
Epoch 5/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 1s/step - accuracy: 0.6296 - loss: 1.3384 - val_accuracy: 0.6499 - val_loss: 1.2612
Epoch 6/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 1s/step - accuracy: 0.6421 - loss: 1.2824 - val_accuracy: 0.6564 - val_loss: 1.2138
Epoch 7/10
[1m87/87[0m [32m━━━━

<keras.src.callbacks.history.History at 0x1a055d86cd0>

In [287]:
# Print prediction(s)
print("Prediction:")
print(logits_to_text(encdec_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print("\nOriginal text:")
print(english_sentences[:1])

Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
new jersey est parfois agrã©able en l' et il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [16]:
#Now we will try a model where we incorporate embedding and biderectional rnns into our model
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a model that incorporates embedding, encoder-decoder, and bidirectional RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    learning_rate = 0.003
    
    # Build the layers    
    model = Sequential()
    # Embedding
    model.add(Embedding(english_vocab_size, 128, input_length=input_shape[1],
                         input_shape=input_shape[1:]))
    # Encoder
    model.add(Bidirectional(GRU(128)))
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [17]:
test_model_final(model_final)




In [18]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(preproc_english_sentences, preproc_french_sentences, test_size=0.2, random_state=42)

In [19]:
#Train neural network using model_final
model = model_final(preproc_english_sentences.shape,preproc_french_sentences.shape[1],len(english_tokenizer	.word_index)+1,len(french_tokenizer.word_index)+1)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 128)           25600     
                                                                 
 bidirectional_2 (Bidirecti  (None, 256)               198144    
 onal)                                                           
                                                                 
 repeat_vector_1 (RepeatVec  (None, 21, 256)           0         
 tor)                                                            
                                                                 
 bidirectional_3 (Bidirecti  (None, 21, 256)           296448    
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 21, 512)           131584    
 stributed)                                           

In [20]:
model.fit(X_train, y_train, batch_size=1024, epochs=25, validation_split=0.2)

Epoch 1/25


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x2258ac941d0>

In [21]:
model.save('Translator Model.keras')

