In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import numpy as np
import time

 ### Read data

In [2]:
file = open("swe.txt", 'r')
lines = file.readlines()

In [3]:
eng_swe = np.array([line.split("\t")[0:2] for line in lines])
english = eng_swe[:, 0]
swedish = eng_swe[:, 1]

### Filter

In [4]:
def punct(elt):
  elts = re.sub("(?<=.)!", " !", elt)
  elts = re.sub("(?<=.)\?", " ?", elts)
  elts = re.sub("(?<=.)\.", " .", elts)
  elts = re.sub("(?<=.),", " ,", elts)
  elts = re.sub("(?<=.);", " ;", elts)
  return elts

def start_end(elt):
  return "<start> " + elt + " <end>"

In [5]:
english_cleaned = [start_end(punct(elt)) for elt in english]
swedish_cleaned = [start_end(punct(elt)) for elt in swedish]

### Tokenize

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [7]:
english_token = Tokenizer(filters='')
english_token.fit_on_texts(english_cleaned)

swedish_token = Tokenizer(filters='')
swedish_token.fit_on_texts(swedish_cleaned)

In [8]:
english_sentences = english_token.texts_to_sequences(english_cleaned)
swedish_sentences = swedish_token.texts_to_sequences(swedish_cleaned)

english_sentences = english_sentences[0:15000]
swedish_sentences = swedish_sentences[0:15000]

In [9]:
english_sentences = pad_sequences(english_sentences, padding='post')
swedish_sentences = pad_sequences(swedish_sentences, padding='post')

### Tf dataset

In [10]:
BUFFER_SIZE = len(english_sentences)
BATCH_SIZE = 64
steps_per_epoch = len(english_sentences)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_english = len(english_token.word_index)+1
vocab_swedish = len(swedish_token.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((english_sentences, swedish_sentences)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

### Models

In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.layers import Layer

In [12]:
class Encoder(Model):

  def __init__(self, units, vocab_size, embedding_dim):
    super(Encoder, self).__init__()
    self.embedding = Embedding(vocab_size, output_dim=embedding_dim)
    self.lstm = LSTM(units, return_state=True, return_sequences=True)                  
                     
  def call(self, data, hidden):
    
    w = self.embedding(data)
    output, state_h, state_c = self.lstm(w, initial_state = hidden)

    return output, state_h, state_c

In [13]:
encoder = Encoder(units, vocab_english, embedding_dim)

In [14]:
class Attention(Layer):
  def __init__(self, units):
    super(Attention, self).__init__()
    self.dense_key = Dense(units)
    self.dense_query = Dense(units)
    self.dense_end = Dense(1)

  def call(self, query, values):

    query_encoded = self.dense_query(tf.expand_dims(query, 1))
    key_encoded = self.dense_key(values)

    final_scores = self.dense_end(
        query_encoded * key_encoded)
    
    attention_weights = tf.nn.softmax(final_scores, axis=1)

    filtered_context = attention_weights * values
    filtered_context = tf.reduce_sum(filtered_context, axis=1)

    return filtered_context, attention_weights

In [15]:
class Decoder(Model):
  def __init__(self, units, vocab_size ,embedding_dim):
    super(Decoder, self).__init__()
    self.embedding = Embedding(vocab_size, embedding_dim)
    self.lstm = LSTM(units, return_sequences=True, return_state=True)
    self.output_layer = Dense(vocab_size)

    self.attention = Attention(30)

  def call(self, data, hidden, encoder_output):

    embedd = self.embedding(data)

    context, _ = self.attention(hidden, encoder_output)
    context = tf.expand_dims(context, axis=1)

    all_data = tf.concat([context, embedd], axis=-1)
    output, state_h, state_c = self.lstm(all_data)

    output = tf.reshape(output, (-1, output.shape[2]))
    pred = self.output_layer(output)    
  
    return pred, state_h, state_c

In [16]:
decoder = Decoder(units, vocab_swedish, embedding_dim)

### Training

In [17]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [18]:
def train_step(inp, target):
  loss = 0

  with tf.GradientTape() as tape:
    hiddens_init = [tf.zeros((BATCH_SIZE , units)), tf.zeros((BATCH_SIZE , units))]
    output, dec_hidden_h, dec_hidden_c = encoder(inp, hiddens_init)

    decoder_first_input = tf.expand_dims([swedish_token.word_index['<start>']] * 64, 1)
    dec_input = decoder_first_input

    for i in range(1, target.shape[1]):
      predictions, dec_hidden_h, dec_hidden_c = decoder(dec_input, tf.concat([dec_hidden_h, dec_hidden_c], axis=-1), output)
      loss += loss_function(target[:, i], predictions)
      
      dec_input = tf.reshape(target[:, i], (64, 1))

  batch_loss = (loss / int(target.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [19]:
for epoch in range(15):
  start = time.time()
  print("Epoch : {}".format(epoch))
  for (batch, (inp, target)) in enumerate(dataset.take(steps_per_epoch)):
    v = train_step(inp, target)
    if batch % 100 == 0:
      print("\tBatch {}, Error : {}".format(batch,v))
  print("\tTemps pour 1 epoch {} sec\n".format(time.time() - start))           

Epoch : 0
	Batch 0, Error : 4.105288982391357
	Batch 100, Error : 2.054779291152954
	Batch 200, Error : 1.9581010341644287
	Temps pour 1 epoch 60.152321338653564 sec

Epoch : 1
	Batch 0, Error : 1.969620943069458
	Batch 100, Error : 1.768649697303772
	Batch 200, Error : 1.4792749881744385
	Temps pour 1 epoch 55.69831037521362 sec

Epoch : 2
	Batch 0, Error : 1.5808970928192139
	Batch 100, Error : 1.5116809606552124
	Batch 200, Error : 1.4190105199813843
	Temps pour 1 epoch 56.16856145858765 sec

Epoch : 3
	Batch 0, Error : 1.339540719985962
	Batch 100, Error : 1.1547611951828003
	Batch 200, Error : 1.3302462100982666
	Temps pour 1 epoch 55.84640169143677 sec

Epoch : 4
	Batch 0, Error : 1.1450202465057373
	Batch 100, Error : 0.9964696168899536
	Batch 200, Error : 1.2382603883743286
	Temps pour 1 epoch 55.94716191291809 sec

Epoch : 5
	Batch 0, Error : 0.9626413583755493
	Batch 100, Error : 0.9303821921348572
	Batch 200, Error : 0.9708364605903625
	Temps pour 1 epoch 56.116392612457275 