In [1]:
import collections
import numpy as np
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [2]:
# Print the devices TensorFlow reports for this machine, to check whether
# training will run on a GPU or fall back to the CPU.
# NOTE(review): `tensorflow.python.*` is a private namespace; the public
# equivalent is `tf.config.list_physical_devices()` — consider switching.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3143555052273652829
xla_global_id: -1
]


In [3]:
def load_data(path, encoding='utf-8'):
  """Read a text file and return its contents split into lines.

  Args:
    path: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      that accented characters are decoded the same way on every platform
      instead of depending on the locale's default encoding.

  Returns:
    A list of strings obtained by splitting the file content on '\n'.
    A file ending with a newline therefore yields a trailing empty string.
  """
  with open(path, 'r', encoding=encoding) as f:
    return f.read().split('\n')

In [4]:
# Load the parallel corpus: line i of the English file is translated by
# line i of the French file.
english_sentences = load_data('data/english')
french_sentences = load_data('data/french')

# The models below pair the sentences by position, so a length mismatch
# would silently misalign every training example after the first gap.
assert len(english_sentences) == len(french_sentences), (
    'English and French corpora must contain the same number of lines')

In [5]:
# Preview the first five English sentences as a sanity check on the loaded data.
english_sentences[:5]

['new jersey is sometimes quiet during autumn , and it is snowy in april .',
 'the united states is usually chilly during july , and it is usually freezing in november .',
 'california is usually quiet during march , and it is usually hot in june .',
 'the united states is sometimes mild during june , and it is cold in september .',
 'your least liked fruit is the grape , but my least liked is the apple .']

In [6]:
def _report_word_stats(sentences, language):
    """Print word statistics for one corpus and return its word Counter.

    Args:
        sentences: Iterable of sentences; words are obtained with str.split().
        language: Label inserted into the printed messages (e.g. 'English').

    Returns:
        collections.Counter mapping each word to its number of occurrences.
    """
    counter = collections.Counter(
        word for sentence in sentences for word in sentence.split())
    # The total word count is the sum of the per-word counts, so the corpus
    # does not have to be split a second time just to measure its length.
    print('{} {} words.'.format(sum(counter.values()), language))
    print('{} unique {} words.'.format(len(counter), language))
    print('10 Most common words in the {} dataset:'.format(language))
    # Iterating the (word, count) pairs directly also works for an empty corpus.
    print('"' + '" "'.join(word for word, _ in counter.most_common(10)) + '"')
    return counter


english_words_counter = _report_word_stats(english_sentences, 'English')
print()
french_words_counter = _report_word_stats(french_sentences, 'French')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [8]:
def tokenize(x):
    """Fit a Keras Tokenizer on `x` and encode `x` with it.

    Args:
        x: List of sentences (strings) to fit on and convert.

    Returns:
        Tuple (sequences, tokenizer): the sentences converted to lists of
        integer word ids, and the fitted Tokenizer that produced them.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted

# Three toy sentences used to show what tokenize() produces.
text_sentences = [
    'The quick brown fox jumps over lazy dog.',
    'By Jove, my quick study of lexicography won the prize.',
    'This is a short sentence.',
]

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()

# Show every sentence next to its sequence of word ids (numbered from 1).
for sample_no, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(sample_no))
    print('Input: {}'.format(sent))
    print('Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'over': 6, 'lazy': 7, 'dog': 8, 'by': 9, 'jove': 10, 'my': 11, 'study': 12, 'of': 13, 'lexicography': 14, 'won': 15, 'prize': 16, 'this': 17, 'is': 18, 'a': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
Input: The quick brown fox jumps over lazy dog.
Output: [1, 2, 3, 4, 5, 6, 7, 8]
Sequence 2 in x
Input: By Jove, my quick study of lexicography won the prize.
Output: [9, 10, 11, 2, 12, 13, 14, 15, 1, 16]
Sequence 3 in x
Input: This is a short sentence.
Output: [17, 18, 19, 20, 21]


In [9]:
def pad(x, length=None):
    """Pad every sequence in `x` at the end so all share one length.

    Args:
        x: List of integer sequences.
        length: Target length. When None, the length of the longest
            sequence in `x` is used.

    Returns:
        The result of pad_sequences(x, maxlen=..., padding='post').
    """
    target_len = length
    if target_len is None:
        target_len = max(len(sentence) for sentence in x)
    return pad_sequences(x, maxlen=target_len, padding='post')

# Pad the toy token sequences and show each one before and after padding.
test_pad = pad(text_tokenized)
for sample_no, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x:'.format(sample_no))
    print('  Input: {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x:
  Input: [1 2 3 4 5 6 7 8]
  Output: [1 2 3 4 5 6 7 8 0 0]
Sequence 2 in x:
  Input: [ 9 10 11  2 12 13 14 15  1 16]
  Output: [ 9 10 11  2 12 13 14 15  1 16]
Sequence 3 in x:
  Input: [17 18 19 20 21]
  Output: [17 18 19 20 21  0  0  0  0  0]


In [10]:
def preprocess(x, y):
    """Tokenize and pad the feature sentences `x` and label sentences `y`.

    Args:
        x: List of source sentences.
        y: List of target sentences.

    Returns:
        Tuple (padded_x, padded_y, x_tokenizer, y_tokenizer). `padded_y`
        carries an extra trailing axis of size 1.
    """
    source_ids, x_tk = tokenize(x)
    target_ids, y_tk = tokenize(y)

    source_padded = pad(source_ids)
    target_padded = pad(target_ids)

    # Append a singleton last axis to the labels: (n, steps) -> (n, steps, 1).
    target_padded = target_padded.reshape(target_padded.shape + (1,))
    return source_padded, target_padded, x_tk, y_tk

# Tokenize and pad both corpora, keeping the fitted tokenizers for decoding later.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

# Word ids start at 1 and 0 is the padding value, hence the extra slot.
english_vocab_size = 1 + len(english_tokenizer.word_index)
french_vocab_size = 1 + len(french_tokenizer.word_index)

# Axis 1 of the padded arrays is the (padded) sentence length.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 200
French vocabulary size: 345


In [11]:
def logits_to_text(logits,tokenizer):
  """Decode per-timestep scores into a space-separated sentence.

  Args:
    logits: 2-D array of scores with one row per timestep and one column
      per word id.
    tokenizer: Object exposing a `word_index` mapping of word -> integer id.

  Returns:
    The highest-scoring word of every timestep joined by single spaces;
    id 0 is rendered as '<PAD>'.
  """
  # Invert word -> id into id -> word, reserving id 0 for padding.
  vocabulary = {0: '<PAD>'}
  for word, word_id in tokenizer.word_index.items():
    vocabulary[word_id] = word

  best_ids = np.argmax(logits, axis=1)
  words = []
  for word_id in best_ids:
    words.append(vocabulary[word_id])
  return ' '.join(words)

# Model 1: RNN

In [12]:
def simple_mod(input_shape, output_seq_length, english_vocab_size, french_vocab_size, learning_rate=0.005):
  """Build and compile a single-GRU model predicting one French word per timestep.

  Args:
    input_shape: Shape of the input array including the leading sample
      axis; only input_shape[1:] is used.
    output_seq_length: Unused; kept so all model builders share a signature.
    english_vocab_size: Unused; kept so all model builders share a signature.
    french_vocab_size: Number of output classes of the final softmax layer.
    learning_rate: Learning rate passed to the Adam optimizer.

  Returns:
    A compiled Sequential model (sparse categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  # Declare the input with an explicit Input layer; passing `input_shape=`
  # to the first layer makes Keras emit a UserWarning in Sequential models.
  model.add(Input(shape=input_shape[1:]))
  model.add(GRU(256, return_sequences=True))
  model.add(TimeDistributed(Dense(1024, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(learning_rate=learning_rate),
                metrics=['accuracy'])

  return model

# Pad the English ids out to the French sequence length and add a trailing
# axis of size 1, so every timestep feeds a single value to the GRU.
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

simple_rnn_model = simple_mod(
  tmp_x.shape,
  max_french_sequence_length,
  english_vocab_size,
  french_vocab_size
)

# Keep the History object so the training curves stay available afterwards,
# instead of leaving it as the cell's throw-away output.
simple_rnn_history = simple_rnn_model.fit(
  tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

  super().__init__(**kwargs)


Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 4s/step - accuracy: 0.4691 - loss: 2.5725 - val_accuracy: 0.6371 - val_loss: 1.2693
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 4s/step - accuracy: 0.6325 - loss: 1.2711 - val_accuracy: 0.6742 - val_loss: 1.0796
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 4s/step - accuracy: 0.6652 - loss: 1.1041 - val_accuracy: 0.6868 - val_loss: 0.9806
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 4s/step - accuracy: 0.6812 - loss: 1.0106 - val_accuracy: 0.7084 - val_loss: 0.8966
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 4s/step - accuracy: 0.6918 - loss: 0.9487 - val_accuracy: 0.7143 - val_loss: 0.8556
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 4s/step - accuracy: 0.6991 - loss: 0.9036 - val_accuracy: 0.7239 - val_loss: 0.8079
Epoch 7/10
[1m108/108

<keras.src.callbacks.history.History at 0x14d3b6eb790>

In [14]:
# Translate the first sentence with the simple RNN and show it next to the
# reference translation and the English source. `tmp_x` must still be the
# 3-D array built in the simple RNN training cell.
print("Prediction:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))
print("\n Correct Translation:", french_sentences[:1], sep="\n")
print("\n Original Text", english_sentences[:1], sep="\n")

Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
new jersey est parfois calme en mois de mai et il il en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

 Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

 Original Text
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


# Model 2: Bidirectional RNN

In [15]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size, learning_rate=0.002):
  """Build and compile a bidirectional-GRU model predicting one French word per timestep.

  Args:
    input_shape: Shape of the input array including the leading sample
      axis; only input_shape[1:] is used.
    output_sequence_length: Unused; kept so all model builders share a signature.
    english_vocab_size: Unused; kept so all model builders share a signature.
    french_vocab_size: Number of output classes of the final softmax layer.
    learning_rate: Learning rate passed to the Adam optimizer.

  Returns:
    A compiled Sequential model (sparse categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  # Declare the input with an explicit Input layer; passing `input_shape=`
  # to the first layer makes Keras emit a UserWarning in Sequential models.
  model.add(Input(shape=input_shape[1:]))
  model.add(Bidirectional(GRU(128, return_sequences=True)))
  model.add(TimeDistributed(Dense(1024, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(learning_rate=learning_rate),
                metrics=['accuracy'])
  return model

# Pad the English ids out to the French sequence length and add a trailing
# axis of size 1, so every timestep feeds a single value to the GRU.
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

bd_rnn_model = bd_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
bd_rnn_model.summary()

# Keep the History object so the training curves stay available afterwards.
bd_rnn_history = bd_rnn_model.fit(
    tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

  super().__init__(**kwargs)


None
Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m428s[0m 4s/step - accuracy: 0.4901 - loss: 2.7275 - val_accuracy: 0.6354 - val_loss: 1.3458
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 3s/step - accuracy: 0.6290 - loss: 1.3315 - val_accuracy: 0.6623 - val_loss: 1.1467
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 4s/step - accuracy: 0.6573 - loss: 1.1672 - val_accuracy: 0.6800 - val_loss: 1.0494
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 3s/step - accuracy: 0.6727 - loss: 1.0769 - val_accuracy: 0.6920 - val_loss: 0.9812
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 3s/step - accuracy: 0.6830 - loss: 1.0128 - val_accuracy: 0.6981 - val_loss: 0.9243
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 6s/step - accuracy: 0.6906 - loss: 0.9636 - val_accuracy: 0.7104 - val_loss: 0.8809
Epoch 7/10
[1m10

<keras.src.callbacks.history.History at 0x14d3b118340>

In [1]:
# NOTE(review): this install failed when the cell was run (see the pip error in
# its output) and no code visible in this notebook imports tensorflow_addons,
# so the cell looks removable — confirm before deleting. If it is kept, prefer
# `%pip install` with a pinned version so the package targets the kernel's env.
!pip install tensorflow-addons


ERROR: Could not find a version that satisfies the requirement tensorflow-addons (from versions: none)
ERROR: No matching distribution found for tensorflow-addons


In [16]:
# Translate the first sentence with the bidirectional RNN and show it next to
# the reference translation and the English source. `tmp_x` must still be the
# 3-D array built in the bidirectional training cell.
print("Prediction:")
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))
print("\n Correct Translation:", french_sentences[:1], sep="\n")
print("\n Original Text", english_sentences[:1], sep="\n")

Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
new jersey est parfois calme en mois et il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

 Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

 Original Text
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


# Model 3: Embedding

In [27]:
def bid_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size, learning_rate=0.005):
  """Build and compile an embedding + bidirectional-GRU translation model.

  Args:
    input_shape: Shape of the input array including the leading sample
      axis; only input_shape[1:] is used.
    output_sequence_length: Unused; kept so all model builders share a signature.
    english_vocab_size: Number of rows of the embedding table (input vocabulary).
    french_vocab_size: Number of output classes of the final softmax layer.
    learning_rate: Learning rate passed to the Adam optimizer.

  Returns:
    A compiled Sequential model (sparse categorical cross-entropy loss,
    accuracy metric).
  """
  model = Sequential()
  # Declare the input with an explicit Input layer rather than passing
  # `input_shape=` to the Embedding layer.
  model.add(Input(shape=input_shape[1:]))
  model.add(Embedding(english_vocab_size, 256))
  model.add(Bidirectional(GRU(256, return_sequences=True)))
  model.add(TimeDistributed(Dense(1024, activation='relu')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

  # Compile model
  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(learning_rate=learning_rate),
                metrics=['accuracy'])
  return model

# The Embedding layer consumes integer ids directly, so the input stays 2-D:
# pad() already returns (n_sentences, max_french_sequence_length) and the
# former reshape to that same shape was a no-op.
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

embedded_model = bid_embed_model(
 tmp_x.shape,
 max_french_sequence_length,
 english_vocab_size,
 french_vocab_size)

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
embedded_model.summary()

# Keep the History object so the training curves stay available afterwards.
embedded_history = embedded_model.fit(
 tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

None
Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m900s[0m 8s/step - accuracy: 0.6013 - loss: 2.0332 - val_accuracy: 0.8912 - val_loss: 0.3440
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m815s[0m 7s/step - accuracy: 0.8984 - loss: 0.3204 - val_accuracy: 0.9482 - val_loss: 0.1687
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m735s[0m 7s/step - accuracy: 0.9455 - loss: 0.1737 - val_accuracy: 0.9634 - val_loss: 0.1213
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m832s[0m 8s/step - accuracy: 0.9603 - loss: 0.1274 - val_accuracy: 0.9697 - val_loss: 0.0998
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m770s[0m 7s/step - accuracy: 0.9686 - loss: 0.1012 - val_accuracy: 0.9746 - val_loss: 0.0844
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m749s[0m 7s/step - accuracy: 0.9736 - loss: 0.0853 - val_accuracy: 0.9752 - val_loss: 0.0829
Epoch 7/10
[1m10

<keras.src.callbacks.history.History at 0x14d3fca7ee0>

In [55]:
# Translate the first sentence with the embedding model and show it next to
# the reference translation and the English source. `tmp_x` must still be the
# 2-D array built in the embedding training cell.
print("Prediction:")
print(logits_to_text(embedded_model.predict(tmp_x[:1])[0], french_tokenizer))
print("\n Correct Translation:", french_sentences[:1], sep="\n")
print("\n Original Text", english_sentences[:1], sep="\n")

Prediction:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

 Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

 Original Text
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [None]:
# Save the trained model. Keras 3 rejects a save path without an extension,
# so use the native `.keras` format explicitly.
embedded_model.save('english_to_french_model.keras')

# Serialize English tokenizer to json.
# NOTE(review): to_json() appears to return a JSON string already, so the file
# holds a JSON-encoded string. The written bytes are kept exactly as before so
# any existing reader of this file keeps working — confirm before changing.
with open('english_tokenizer.json', 'w', encoding='utf-8') as f:
  json.dump(english_tokenizer.to_json(), f, ensure_ascii=False)

# Serialize French tokenizer to json (same format as the English one).
with open('french_tokenizer.json', 'w', encoding='utf-8') as f:
  json.dump(french_tokenizer.to_json(), f, ensure_ascii=False)

# Save the padded sequence length the model was trained with, so inputs can
# be padded to the same length at inference time.
max_french_sequence_length_json = max_french_sequence_length
with open('sequence_length.json', 'w', encoding='utf-8') as f:
  json.dump(max_french_sequence_length_json, f, ensure_ascii=False)