In [1]:
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM,Dense,Embedding,TimeDistributed,Dropout,Bidirectional
from tensorflow.keras.models import Sequential

In [2]:
def load_data(path):
    """Read a text corpus and return it as a list of lines.

    Args:
        path: Location of a UTF-8 encoded text file.

    Returns:
        list[str]: The file content split on '\n'. A file that ends with a
        newline therefore yields a trailing empty string.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may garble or reject the accented characters of the
    # French corpus. This matches the encoding used when writing the
    # tokenizer files at the end of the notebook.
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()
    return data.split('\n')

# Read both sides of the parallel corpus (English first, then French).
english_data, french_data = load_data('data/english'), load_data('data/french')

In [3]:
# Show the first five sentences of each corpus as a sanity check.
for corpus in (english_data, french_data):
    print(corpus[:5])

['new jersey is sometimes quiet during autumn , and it is snowy in april .', 'the united states is usually chilly during july , and it is usually freezing in november .', 'california is usually quiet during march , and it is usually hot in june .', 'the united states is sometimes mild during june , and it is cold in september .', 'your least liked fruit is the grape , but my least liked is the apple .']
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril .", 'les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .', 'california est généralement calme en mars , et il est généralement chaud en juin .', 'les états-unis est parfois légère en juin , et il fait froid en septembre .', 'votre moins aimé fruit est le raisin , mais mon moins aimé est la pomme .']


In [4]:
def tokenize(sentence):
    """Fit a fresh Keras Tokenizer on the given texts and encode them.

    Args:
        sentence: The texts to fit on and encode. Despite the singular name,
            the callers in this notebook pass a list of sentences.

    Returns:
        tuple: (sequences, tokenizer) where ``sequences`` is the result of
        ``texts_to_sequences`` on the same input and ``tokenizer`` is the
        fitted Tokenizer instance.
    """
    fitted_tk = tf.keras.preprocessing.text.Tokenizer()
    fitted_tk.fit_on_texts(sentence)

    encoded = fitted_tk.texts_to_sequences(sentence)
    return encoded, fitted_tk


In [5]:
def pad(sentences, max_len = 21):
    """Pad integer sequences at the end to a common length.

    Args:
        sentences: Iterable of token-id sequences.
        max_len: Target length. When falsy (``None`` or ``0``) the length of
            the longest sequence in ``sentences`` is used instead.

    Returns:
        The result of Keras ``pad_sequences`` with ``padding='post'``; all
        other options are left at the Keras defaults.
    """
    # Fall back to the longest sequence only when no usable length was given.
    target_len = max_len if max_len else max(len(seq) for seq in sentences)
    return tf.keras.preprocessing.sequence.pad_sequences(
        sentences, maxlen=target_len, padding='post')


In [6]:
def preprocess(x,y):
    """Tokenize and pad a pair of corpora.

    Args:
        x: Source-side texts.
        y: Target-side texts.

    Returns:
        tuple: (padded_x, padded_y, x_tokenizer, y_tokenizer). ``padded_y``
        carries an extra trailing axis of size 1 compared to ``padded_x``.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Append a singleton last axis to the targets
    # (presumably the label layout expected by the sparse loss - confirm).
    y_padded = y_padded.reshape(y_padded.shape + (1,))
    return x_padded, y_padded, x_tk, y_tk


In [7]:
# Run the preprocessing pipeline on both corpora.
eng_preprocess, fre_preprocess, eng_tokenizer, fre_tokenizer = preprocess(english_data, french_data)

# Vocabulary sizes: the word index plus one extra slot for padding.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
fre_vocab_size = 1 + len(fre_tokenizer.word_index)

# Sequence lengths are the second axis of the padded arrays.
max_english_sequence_length = eng_preprocess.shape[1]
max_french_sequence_length = fre_preprocess.shape[1]

# Report shapes, vocabulary sizes and sequence lengths.
for label, value in (
    ('eng_data_shape ', eng_preprocess.shape),
    ('fre_data_shape: ,', fre_preprocess.shape),
    ('english_vocabulary: ', eng_vocab_size),
    ('french_vocabulary:', fre_vocab_size),
    ('eng_seq_length: ', max_english_sequence_length),
    ('fre_seq_length: ', max_french_sequence_length),
):
    print(label, value)


eng_data_shape  (137861, 21)
fre_data_shape: , (137861, 21, 1)
english_vocabulary:  200
french_vocabulary: 345
eng_seq_length:  21
fre_seq_length:  21


In [8]:
def seq_to_text(logits,tokenizer):
    """Decode per-step scores into a space-separated sentence.

    Args:
        logits: 2-D array-like; the arg-max is taken along axis 1, giving
            one token id per row.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        str: The words for the arg-max ids joined by single spaces. Rows
        whose arg-max id is 0 (the padding id) are skipped.
    """
    # Invert the tokenizer vocabulary so ids can be looked up.
    vocab = {}
    for token, token_id in tokenizer.word_index.items():
        vocab[token_id] = token
    vocab[0] = '<pad>'

    best_ids = np.argmax(logits, 1)
    words = [vocab[token_id] for token_id in best_ids if token_id != 0]
    return ' '.join(words)

In [9]:

def lstm_nmt_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
                   learning_rate=0.005):
    """Build and compile the embedding + bidirectional-LSTM translation model.

    Layer stack: Embedding(128) -> Bidirectional(LSTM(128, return_sequences))
    -> TimeDistributed(Dense(512, relu)) -> Dropout(0.5)
    -> TimeDistributed(Dense(french_vocab_size, softmax)).

    Args:
        input_shape: Shape of the full input array; the leading (sample) axis
            is dropped and the remainder is used as the per-sample input shape.
        output_sequence_length: Not used by this architecture; kept so
            existing callers keep working.
        english_vocab_size: Number of rows in the embedding table.
        french_vocab_size: Width of the final softmax layer.
        learning_rate: Learning rate for the Adam optimizer (default 0.005,
            the value previously hard-coded in the function body).

    Returns:
        A compiled ``Sequential`` model using sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    # Build the layers
    model = Sequential()
    # Declare the input with an explicit Input object. Passing
    # ``input_shape`` / ``input_length`` to Embedding is deprecated in recent
    # Keras releases and triggers a UserWarning at construction time.
    model.add(tf.keras.Input(shape=input_shape[1:]))
    model.add(Embedding(english_vocab_size, 128))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss = tf.keras.losses.sparse_categorical_crossentropy,
                  optimizer = tf.keras.optimizers.Adam(learning_rate),
                  metrics = ['accuracy'])

    return model

# Pad the English ids to the French sequence length and make sure the array is
# 2-D with that many columns, so model input and target have the same number
# of time steps.
tmp_x = pad(eng_preprocess, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1,fre_preprocess.shape[-2]))

# Build the model
lstm_model = lstm_nmt_model(
    tmp_x.shape,
    max_french_sequence_length,
    eng_vocab_size,
    fre_vocab_size)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
lstm_model.summary()
    

  super().__init__(**kwargs)


None


In [10]:
# Train for 10 epochs in batches of 1024, holding out 20% of the samples
# for validation.
lstm_model.fit(
    tmp_x,
    fre_preprocess,
    batch_size=1024,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 1s/step - accuracy: 0.5518 - loss: 2.2319 - val_accuracy: 0.8573 - val_loss: 0.4606
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 1s/step - accuracy: 0.8647 - loss: 0.4356 - val_accuracy: 0.9228 - val_loss: 0.2438
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 1s/step - accuracy: 0.9178 - loss: 0.2607 - val_accuracy: 0.9467 - val_loss: 0.1713
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 1s/step - accuracy: 0.9399 - loss: 0.1922 - val_accuracy: 0.9564 - val_loss: 0.1405
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 1s/step - accuracy: 0.9517 - loss: 0.1538 - val_accuracy: 0.9646 - val_loss: 0.1173
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 895ms/step - accuracy: 0.9595 - loss: 0.1292 - val_accuracy: 0.9693 - val_loss: 0.1007
Epoch 7/10
[1m108/1

<keras.src.callbacks.history.History at 0x1672c977730>

In [11]:
# Print prediction(s)
# One index selects the sample for all three views so they cannot drift apart.
sample_idx = 3

print("Prediction:")
# predict() is given a one-sample batch; [0] takes that sample's scores.
print(seq_to_text(lstm_model.predict(tmp_x[sample_idx:sample_idx + 1])[0], fre_tokenizer))

print("\nCorrect Translation:")
print(french_data[sample_idx:sample_idx + 1])

print('\nOriginal text:')
print(english_data[sample_idx:sample_idx + 1])

Prediciton:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 690ms/step
les états unis est parfois doux en juin et il fait froid en septembre

Correct Translation:
['les états-unis est parfois légère en juin , et il fait froid en septembre .']

Original text:
['the united states is sometimes mild during june , and it is cold in september .']


In [12]:
# Persist the trained model.
lstm_model.save('english_to_french_model.h5')

# The tokenizer files are written into a sub-directory; create it first so the
# writes below do not fail with FileNotFoundError when it does not exist yet.
os.makedirs('tokenizers', exist_ok=True)

# Serialize the English and French tokenizers to JSON.
# NOTE(review): to_json() appears to return a JSON string already, so
# json.dumps() wraps it in a second layer of JSON encoding. The on-disk format
# is deliberately left unchanged; loaders presumably json.load() the file
# before rebuilding the tokenizer - confirm against the consumer.
for tokenizer_path, tk in (
    ('tokenizers/english_tokenizer.json', eng_tokenizer),
    ('tokenizers/french_tokenizer.json', fre_tokenizer),
):
    with open(tokenizer_path, 'w', encoding='utf8') as f:
        f.write(json.dumps(tk.to_json(), ensure_ascii=False))
    

