In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import numpy as np
import time

### Read data

In [None]:
# Load the tab-separated sentence pairs. The context manager closes the file
# handle as soon as the lines are read, and the explicit encoding keeps the
# decoding of non-ASCII characters independent of the platform default.
with open("drive/MyDrive/data/fra.txt", 'r', encoding="utf-8") as file:
  lines = file.readlines()

In [None]:
# Keep the first two tab-separated fields of every line, then expose the
# first column as the English sentences and the second as the French ones.
sentence_pairs = [row.split("\t")[:2] for row in lines]
eng_fr = np.array(sentence_pairs)
english, french = eng_fr[:, 0], eng_fr[:, 1]

### Preprocessing

In [None]:
def punct(elt):
  """Insert a space in front of punctuation so each mark becomes its own token.

  Every `!`, `?`, `.`, `,` or `;` that has a character (other than a newline)
  in front of it gets one space inserted before it. A mark that is the very
  first character of the string is left untouched.

  Args:
    elt: the sentence to process.

  Returns:
    The sentence with a space inserted before each qualifying mark.
  """
  # A single raw-string pattern replaces the five sequential passes and avoids
  # the invalid "\?" / "\." escape sequences of a non-raw string literal.
  return re.sub(r"(?<=.)([!?.,;])", r" \1", elt)

def start_end(elt):
  """Surround a sentence with the `<start>` and `<end>` boundary tokens."""
  prefix, suffix = "<start> ", " <end>"
  return prefix + elt + suffix

In [None]:
# Tokenizer-ready corpora: punctuation is split off first, then the boundary
# tokens are added around every sentence.
english_cleaned = list(map(lambda sentence: start_end(punct(sentence)), english))
french_cleaned = list(map(lambda sentence: start_end(punct(sentence)), french))

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# filters='' disables the tokenizer's character filtering, so the separated
# punctuation marks and the <start>/<end> markers stay in the vocabulary.
english_token, french_token = Tokenizer(filters=''), Tokenizer(filters='')

for tokenizer, corpus in ((english_token, english_cleaned),
                          (french_token, french_cleaned)):
  tokenizer.fit_on_texts(corpus)

In [None]:
# Only the first NUM_SENTENCES pairs are used further on, so slice the text
# before converting it to ids: the result is the same as converting the whole
# corpus and slicing afterwards, without tokenizing sentences that are then
# thrown away.
NUM_SENTENCES = 15000

english_sentences = english_token.texts_to_sequences(english_cleaned[:NUM_SENTENCES])
french_sentences = french_token.texts_to_sequences(french_cleaned[:NUM_SENTENCES])

In [None]:
# Pad each language separately, appending the padding after the real tokens
# (padding='post').
english_sentences, french_sentences = (
    pad_sequences(sequences, padding='post')
    for sequences in (english_sentences, french_sentences)
)

### tf.data dataset

In [None]:
BATCH_SIZE = 100
# Shuffle buffer as large as the data set, i.e. a full shuffle.
BUFFER_SIZE = len(english_sentences)
# One more than the number of known words: id 0 is the padding value.
vocab_english = 1 + len(english_token.word_index)
vocab_french = 1 + len(french_token.word_index)

sentence_pairs_ds = tf.data.Dataset.from_tensor_slices((english_sentences, french_sentences))
dataset = sentence_pairs_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Pull one batch out of the pipeline to use as a fixture for the shape checks
# in the model cells below, and display both shapes.
batch_iterator = iter(dataset)
example_input_batch, example_target_batch = next(batch_iterator)
(example_input_batch.shape, example_target_batch.shape)

(TensorShape([100, 8]), TensorShape([100, 14]))

### Models

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dropout, Dense, Embedding, LayerNormalization
from tensorflow.keras.layers import Layer

#### Masks

In [None]:
def look_ahead_mask(shape):
  """Return a mask that is 1 strictly above the diagonal and 0 elsewhere.

  Args:
    shape: shape of the mask to build, passed to `tf.ones`.

  Returns:
    A tensor of the given shape; entry (i, j) is 1 when j > i, so a position
    is blocked from attending to later positions.
  """
  lower_triangle = tf.linalg.band_part(tf.ones(shape), -1, 0)
  return 1 - lower_triangle

def create_padding_mask(seq):
  """Mark the zero (padding) entries of a batch of token ids.

  Args:
    seq: 2-D tensor of token ids, indexed as (batch, position).

  Returns:
    A float32 tensor with two singleton axes inserted in the middle; it holds
    1.0 where `seq` is 0 and 0.0 elsewhere.
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]

#### Positional encoding and scaled dot-product multi-head attention

In [None]:
def get_angles(pos, i, d_model):
  """Compute the angles fed to the sinusoidal positional encoding.

  Args:
    pos: position indices.
    i: embedding-dimension indices; each even/odd pair shares one rate.
    d_model: embedding width.

  Returns:
    `pos * 1 / 10000 ** (2 * (i // 2) / d_model)`, broadcast over pos and i.
  """
  exponent = (2 * (i//2)) / np.float32(d_model)
  rates = 1 / np.power(10000, exponent)
  return pos * rates

def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions to encode.
    d_model: embedding width.

  Returns:
    A float32 tensor of shape (1, position, d_model): sine of the angle on the
    even embedding indices, cosine on the odd ones.
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  angle_rads = get_angles(positions, dims, d_model)

  # Overwrite the angles in place: even columns -> sin, odd columns -> cos.
  even_cols, odd_cols = slice(0, None, 2), slice(1, None, 2)
  angle_rads[:, even_cols] = np.sin(angle_rads[:, even_cols])
  angle_rads[:, odd_cols] = np.cos(angle_rads[:, odd_cols])

  # Leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)


In [None]:
def attention(k, v, q, mask):
  """Scaled dot-product attention.

  Computes softmax(q . k^T / sqrt(d_k) + mask * -1e9) . v, where d_k is the
  size of the last axis of `k`.

  Args:
    k: keys; the last axis is the per-head depth d_k.
    v: values, with as many positions as `k`.
    q: queries, with the same last-axis size as `k`.
    mask: tensor broadcastable to the score tensor holding 1 at positions that
      must not be attended to, or None to attend everywhere.

  Returns:
    The attention-weighted sum of `v` for every query position.
  """
  attention_scores = tf.linalg.matmul(q, k, transpose_b=True)

  # Scale by sqrt(d_k), not d_k: dividing by the full depth squashes the
  # scores far more than intended and flattens the softmax.
  depth = tf.cast(tf.shape(k)[-1], attention_scores.dtype)
  scaled_attention_scores = attention_scores / tf.math.sqrt(depth)

  # A large negative offset drives the softmax weight of masked positions to 0.
  if mask is not None:
    scaled_attention_scores += (mask * -1e9)

  attention_weights = tf.nn.softmax(scaled_attention_scores, axis=-1)

  output = tf.matmul(attention_weights, v)

  return output

In [None]:
class MultiHeadAttention(Layer):
  """Multi-head attention built on the module-level `attention` function.

  Keys and values are projected from `value`, queries from `query`; each
  projection is split into `heads` slices of size `units // heads`, attended
  independently, and the per-head results are concatenated again.

  NOTE(review): there is no final output projection after the heads are
  concatenated; confirm this is intentional.

  Args:
    units: total width of the projections and of the output.
    heads: number of attention heads; must divide `units`.

  Raises:
    ValueError: if `units` is not a multiple of `heads`.
  """

  def __init__(self, units, heads):
    super(MultiHeadAttention, self).__init__()
    # Without this check the floor division below silently drops features and
    # the reshapes produce wrong shapes or fail later with an obscure error.
    if units % heads != 0:
      raise ValueError(
          "units ({}) must be divisible by heads ({})".format(units, heads))
    self.v = Dense(units)
    self.q = Dense(units)
    self.k = Dense(units)
    self.units = units
    self.heads = heads
    self.depth = self.units // self.heads

  def split_head(self, data):
    """Reshape (batch, seq, units) into (batch, heads, seq, depth)."""
    # tf.shape gives the runtime batch size, so this also works when the
    # static batch dimension is unknown (e.g. inside a tf.function).
    batch_size = tf.shape(data)[0]
    data = tf.reshape(data, (batch_size, -1, self.heads, self.depth))
    return tf.transpose(data, perm=[0, 2, 1, 3])

  def call(self, value, query, mask):
    """Attend from `query` over `value`.

    Args:
      value: tensor the keys and values are computed from.
      query: tensor the queries are computed from.
      mask: attention mask forwarded to `attention`.

    Returns:
      Tensor of shape (batch, query_len, units).
    """
    key_encoded = self.k(value)
    query_encoded = self.q(query)
    value_encoded = self.v(value)

    key_encoded = self.split_head(key_encoded)
    query_encoded = self.split_head(query_encoded)
    value_encoded = self.split_head(value_encoded)

    output = attention(key_encoded, value_encoded, query_encoded, mask)
    # Undo split_head: (batch, heads, seq, depth) -> (batch, seq, units).
    output = tf.transpose(output, perm=[0, 2, 1, 3])
    output = tf.reshape(output, (tf.shape(output)[0], -1, self.units))

    return output

#### Encoder

In [None]:
class Encoder_layer(Layer):
  """One encoder block: self-attention followed by a feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual connection and
  its own layer normalisation.

  Args:
    units: width of the attention output and of the block output.
    num_heads: number of attention heads.
    ff_units: width of the hidden feed-forward layer (100 by default).
  """

  def __init__(self, units, num_heads, ff_units=100):
    super(Encoder_layer, self).__init__()
    self.self_attention = MultiHeadAttention(units, num_heads)
    self.norm_1 = LayerNormalization(epsilon=1e-6)
    self.norm_2 = LayerNormalization(epsilon=1e-6)
    self.feed_frwd_1 = Dense(ff_units, activation="relu")
    self.feed_frwd_2 = Dense(units)
    self.dropout_1 = Dropout(0.1)
    self.dropout_2 = Dropout(0.1)

  def call(self, data, encoder_pad_mask):
    """Run the block.

    Args:
      data: input tensor whose last axis has size `units`.
      encoder_pad_mask: mask hiding the padded source positions.

    Returns:
      Tensor with the same shape as `data`.
    """
    # Sub-layer 1: self-attention with its own dropout and normalisation.
    self_att_enc = self.self_attention(data, data, encoder_pad_mask)
    self_att_enc = self.dropout_1(self_att_enc)
    normalised_out = self.norm_1(data + self_att_enc)

    # Sub-layer 2: feed-forward network. It uses dropout_2 / norm_2 so the
    # two sub-layers no longer share one set of normalisation weights.
    fc = self.feed_frwd_1(normalised_out)
    fc2 = self.feed_frwd_2(fc)
    fc2 = self.dropout_2(fc2)
    output = self.norm_2(fc2 + normalised_out)

    return output

In [None]:
class Encoder(Layer):
  """Token embedding plus positional encoding, followed by a stack of
  `Encoder_layer` blocks.

  Args:
    units: width used by every encoder block.
    num_heads: attention heads per block.
    vocab_size: size of the input vocabulary.
    embedding_dim: width of the token embeddings.
    num_layers: number of stacked encoder blocks.
  """

  def __init__(self, units, num_heads, vocab_size, embedding_dim, num_layers):
    super().__init__()
    self.units = units
    self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    self.embedding_dim = embedding_dim
    self.num_layers = num_layers
    blocks = []
    for _ in range(num_layers):
      blocks.append(Encoder_layer(units, num_heads))
    self.encoders = blocks

  def call(self, data, encoder_pad_mask):
    """Encode a batch of token ids; returns the output of the last block."""
    embedded = self.embedding(data)
    seq_len = embedded.shape[1]
    hidden = embedded + positional_encoding(seq_len, self.embedding_dim)

    for block in self.encoders:
      hidden = block(hidden, encoder_pad_mask)

    return hidden

In [None]:
# Smoke test: push the example batch through a small stand-alone encoder.
enc_pad_mask = create_padding_mask(example_input_batch)
encoder = Encoder(units=168, num_heads=8, vocab_size=vocab_english,
                  embedding_dim=168, num_layers=4)
encoded = encoder(example_input_batch, enc_pad_mask)

In [None]:
# Display the shape of the encoder output for the example batch.
encoded.shape

TensorShape([100, 8, 168])

#### Decoder

In [None]:
class Decoder_layer(Layer):
  """One decoder block: masked self-attention, attention over the encoder
  output, then a feed-forward network.

  Every sub-layer is followed by dropout, a residual connection and its own
  layer normalisation.

  Args:
    units: width of the attention outputs and of the block output.
    num_heads: number of attention heads.
  """

  def __init__(self, units, num_heads):
    super().__init__()
    self.self_attention_1 = MultiHeadAttention(units, num_heads)
    self.self_attention_2 = MultiHeadAttention(units, num_heads)
    self.norm_1 = LayerNormalization()
    self.norm_2 = LayerNormalization()
    self.norm_3 = LayerNormalization()
    self.dropout_1 = Dropout(0.1)
    self.dropout_2 = Dropout(0.1)
    self.dropout_3 = Dropout(0.1)
    self.feed_frwd_1 = Dense(100, activation="relu")
    self.feed_frwd_2 = Dense(units)

  def call(self, target_input, encoder_input, combined_mask, padding_mask):
    """Run the block.

    Args:
      target_input: decoder-side input tensor.
      encoder_input: encoder output that the second attention reads from.
      combined_mask: mask applied in the self-attention over the target.
      padding_mask: mask applied in the attention over the encoder output.

    Returns:
      Tensor with the same shape as `target_input`.
    """
    # 1) Self-attention over the target sequence.
    target_context = self.dropout_1(
        self.self_attention_1(target_input, target_input, combined_mask))
    after_self_attention = self.norm_1(target_input + target_context)

    # 2) Queries come from the decoder, keys/values from the encoder output.
    source_context = self.dropout_2(
        self.self_attention_2(encoder_input, after_self_attention, padding_mask))
    after_cross_attention = self.norm_2(source_context + after_self_attention)

    # 3) Position-wise feed-forward network.
    hidden = self.feed_frwd_1(after_cross_attention)
    projected = self.dropout_3(self.feed_frwd_2(hidden))

    return self.norm_3(projected + after_cross_attention)


In [None]:
class Decoder(Layer):
  """Token embedding plus positional encoding, followed by a stack of
  `Decoder_layer` blocks.

  Args:
    units: width used by every decoder block.
    num_heads: attention heads per block.
    vocab_size: size of the target vocabulary.
    embedding_dim: width of the token embeddings.
    num_layers: number of stacked decoder blocks.
  """

  def __init__(self, units, num_heads, vocab_size, embedding_dim, num_layers):
    super().__init__()
    self.units = units
    self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    self.embedding_dim = embedding_dim
    self.num_layers = num_layers
    blocks = []
    for _ in range(num_layers):
      blocks.append(Decoder_layer(units, num_heads))
    self.decoder = blocks

  def call(self, data_encoded, data_target, combined_mask, pad_mask):
    """Decode target token ids against the encoder output.

    Returns the output of the last decoder block.
    """
    embedded = self.embedding(data_target)
    target_len = embedded.shape[1]
    hidden = embedded + positional_encoding(target_len, self.embedding_dim)

    for block in self.decoder:
      hidden = block(hidden, data_encoded, combined_mask, pad_mask)

    return hidden

In [None]:
# Masks for the example decoder input (the target without its last token).
# The look-ahead size is read from the batch instead of being hard-coded, so
# the cell keeps working when the padded target length changes.
example_decoder_input = example_target_batch[:, :-1]
dec_pad_mask = create_padding_mask(example_decoder_input)
example_target_len = example_decoder_input.shape[1]
l_h_mask = look_ahead_mask((example_target_len, example_target_len))
# A position is masked when it is padding OR lies in the future.
combined_mask = tf.math.maximum(dec_pad_mask, l_h_mask)

In [None]:
# Smoke test: decode the example targets against the encoder output above.
decoder = Decoder(units=168, num_heads=8, vocab_size=vocab_french,
                  embedding_dim=168, num_layers=4)
dec = decoder(encoded, example_target_batch[:, :-1], combined_mask, enc_pad_mask)
dec.shape

TensorShape([100, 13, 168])

#### Transformer class

In [None]:
class Transformer(Model):
  """Encoder-decoder Transformer that scores the target vocabulary.

  Args:
    units: width of the encoder/decoder blocks.
    num_heads: attention heads per block.
    inp_vocab: size of the source vocabulary.
    embedding_dim: width of the token embeddings.
    num_layers: number of encoder blocks and of decoder blocks.
    targ_vocab: size of the target vocabulary (width of the output layer).
  """

  def __init__(self, units, num_heads, inp_vocab, embedding_dim, num_layers, targ_vocab):
    super(Transformer, self).__init__()

    self.encoder = Encoder(units, num_heads, inp_vocab, embedding_dim, num_layers)
    self.decoder = Decoder(units, num_heads, targ_vocab, embedding_dim, num_layers)
    # No softmax here: the notebook's loss is created with from_logits=True
    # and applies the softmax itself, so a softmax in the model would be
    # applied twice. argmax over logits picks the same token as over
    # probabilities, so greedy decoding is unaffected.
    self.out = Dense(targ_vocab)

  def call(self, input_encoder, input_decoder, encoder_pad_mask, combined_mask, training=None):
    """Run the full model.

    Args:
      input_encoder: source token ids.
      input_decoder: target token ids fed to the decoder.
      encoder_pad_mask: padding mask for the source, used by the encoder and
        by the decoder's attention over the encoder output.
      combined_mask: padding + look-ahead mask for the decoder self-attention.
      training: Keras training flag; nested layers called without an explicit
        flag pick it up from this outer call.

    Returns:
      Unnormalised scores (logits) of shape (batch, target_len, targ_vocab).
    """
    encoded_data = self.encoder(input_encoder, encoder_pad_mask)
    decoded_data = self.decoder(encoded_data, input_decoder, combined_mask, encoder_pad_mask)
    out = self.out(decoded_data)

    return out

In [None]:
# Hyper-parameters of the full Transformer built in the next cell.
# Width of the token embeddings.
EMBEDDING_DIM = 168
# Width of the attention blocks. The encoder/decoder blocks add the attention
# output to their input, so this has to equal EMBEDDING_DIM.
UNITS = 168
# Number of stacked encoder blocks and of stacked decoder blocks.
NUM_LAYERS = 6
# Attention heads per block; UNITS // NUM_HEADS is the per-head depth.
NUM_HEADS = 8

In [None]:
# Build the model that gets trained below and run it once on the example
# batch to create its weights and check the output shape.
transformer = Transformer(units=UNITS, num_heads=NUM_HEADS, inp_vocab=vocab_english,
                          embedding_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS,
                          targ_vocab=vocab_french)
transformed = transformer(example_input_batch, example_target_batch[:, :-1], enc_pad_mask, combined_mask)
transformed.shape

TensorShape([100, 13, 34403])

In [None]:
# Display the shape of the combined (padding + look-ahead) decoder mask.
combined_mask.shape

TensorShape([100, 1, 13, 13])

### Train

In [None]:
from tensorflow.keras.optimizers.schedules import LearningRateSchedule

In [None]:
# Read by Warmup_lr: while step < warmup_steps the learning rate grows
# linearly with the step; afterwards it decays as step ** -0.5.
warmup_steps = 4000

In [None]:
class Warmup_lr(LearningRateSchedule):
  """Learning-rate schedule with linear warm-up and inverse-sqrt decay.

  lr(step) = UNITS ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

  `UNITS` and `warmup_steps` are read from the notebook globals when the
  schedule is called.
  """

  def __call__(self, step):
    # The optimizer may pass its integer iteration counter; the negative
    # fractional powers below are only defined for floating-point tensors.
    step = tf.cast(step, tf.float32)
    return (UNITS ** -0.5) * tf.math.minimum(step ** -0.5, step * (warmup_steps ** -1.5))

learning_rate = Warmup_lr()

optimizer = tf.keras.optimizers.Adam(learning_rate)

In [None]:
# from_logits=True: the loss applies the softmax itself, so the predictions
# passed to it are expected to be raw, unnormalised scores.
# reduction='none': keep one loss value per token so that loss_fx can mask
# out the padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def loss_fx(real, predicted):
  """Average the per-token loss over the non-padding targets.

  Args:
    real: target token ids; id 0 marks padding.
    predicted: model predictions passed on to `loss_object`.

  Returns:
    Scalar: the sum of the losses at positions where `real` is non-zero,
    divided by the number of such positions.
  """
  is_real_token = tf.math.not_equal(real, 0)
  weights = tf.cast(is_real_token, tf.float32)
  per_token_loss = loss_object(real, predicted)
  return tf.reduce_sum(per_token_loss * weights) / tf.reduce_sum(weights)

In [None]:
# Toy inputs for loss_fx: three target ids (two of them padding, id 0) and
# one two-class prediction row per target.
example_real = tf.constant([1, 0, 0])
example_target = tf.constant([[0., 1.],
                              [0., 1.],
                              [1., 0.]])

In [None]:
# Only the first target id is non-zero, so only the first row contributes to
# the masked average.
loss_fx(example_real, example_target)

<tf.Tensor: shape=(), dtype=float32, numpy=0.31326166>

In [None]:
def train_step(input_text, target):
  """Run one optimisation step on a batch and return its loss.

  The decoder is fed the target without its last token and is trained to
  predict the target shifted left by one position (teacher forcing).

  Args:
    input_text: batch of source token ids, shape (batch, source_len).
    target: batch of target token ids, shape (batch, target_len).

  Returns:
    Scalar tensor: the masked mean loss of the batch.
  """
  tar = target[:, 1:]
  input_dec = target[:, :-1]

  enc_pad_mask = create_padding_mask(input_text)

  # Decoder self-attention must ignore both padding and future positions.
  target_len = input_dec.shape[1]
  l_h_mask = look_ahead_mask((target_len, target_len))
  dec_target_padding_mask = create_padding_mask(input_dec)
  combined_mask = tf.math.maximum(dec_target_padding_mask, l_h_mask)

  with tf.GradientTape() as tape:
    # training=True switches the Dropout layers on; without it Keras runs
    # the model in inference mode and dropout is never applied.
    output = transformer(input_text, input_dec, enc_pad_mask, combined_mask,
                         training=True)
    loss = loss_fx(tar, output)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

  # loss_fx already returns a scalar, so no further reduction is needed.
  return loss

In [None]:
# Train for 50 passes over the data set, reporting the loss of every 50th
# batch and the wall-clock time of each epoch.
for epoch in range(50):
  epoch_started = time.time()
  for batch, (inp, tar) in enumerate(dataset):
    mean_loss = train_step(inp, tar)
    if batch % 50 != 0:
      continue
    print("batch {}, loss {}".format(batch, mean_loss))

  elapsed = time.time() - epoch_started
  print("Epoch {} time elapsed {} seconds".format(epoch, elapsed))

batch 0, loss 10.431745529174805
batch 50, loss 10.231319427490234
batch 100, loss 9.832564353942871
Epoch 0 time elapsed 56.67358183860779 seconds
batch 0, loss 9.30780029296875
batch 50, loss 8.635746955871582
batch 100, loss 7.873263359069824
Epoch 1 time elapsed 56.66216468811035 seconds
batch 0, loss 7.034322738647461
batch 50, loss 6.222867965698242
batch 100, loss 5.588581562042236
Epoch 2 time elapsed 56.23841905593872 seconds
batch 0, loss 5.136653423309326
batch 50, loss 4.826890468597412
batch 100, loss 4.596340179443359
Epoch 3 time elapsed 56.04957365989685 seconds
batch 0, loss 4.375748634338379
batch 50, loss 4.209961414337158
batch 100, loss 4.079482078552246
Epoch 4 time elapsed 56.05132746696472 seconds
batch 0, loss 3.848883867263794
batch 50, loss 3.802645206451416
batch 100, loss 3.7814507484436035
Epoch 5 time elapsed 55.793057680130005 seconds
batch 0, loss 3.3991098403930664
batch 50, loss 3.338036060333252
batch 100, loss 3.4794554710388184
Epoch 6 time elapsed

### Predictions

In [None]:
# Inspect the token ids of the first sentence of the example input batch.
example_input_batch[0]

<tf.Tensor: shape=(8,), dtype=int32, numpy=array([   1,  212,    9, 2922,    3,    2,    0,    0], dtype=int32)>

In [None]:
# Masks : 

def translate(sentence, max_length=15):
  """Greedily translate an English sentence with the global `transformer`.

  The sentence goes through the same preprocessing as the training text
  (`punct`, then `start_end`). Decoding starts from the `<start>` token and
  appends the highest-scoring token at each step until `<end>` is produced or
  `max_length` tokens have been generated.

  Args:
    sentence: raw English sentence.
    max_length: maximum number of tokens to generate (15 by default).

  Returns:
    A one-element list with the decoded French text, boundary tokens included.
  """
  # Split punctuation exactly as the training data was split; otherwise a
  # word with an attached mark (e.g. "pretty.") is not in the vocabulary.
  prepared = [start_end(punct(sentence))]
  tokenized_sent = tf.convert_to_tensor(english_token.texts_to_sequences(prepared))

  # Look the boundary ids up instead of assuming <start> has id 1.
  start_id = french_token.word_index["<start>"]
  end_id = french_token.word_index["<end>"]

  # The source never changes, so its padding mask is built once.
  enc_pad_mask = create_padding_mask(tokenized_sent)

  output = tf.convert_to_tensor([[start_id]], dtype=tf.int32)

  for _ in range(max_length):
    current_len = output.shape[1]
    l_h_mask = look_ahead_mask((current_len, current_len))
    dec_target_padding_mask = create_padding_mask(output)
    combined_mask = tf.math.maximum(dec_target_padding_mask, l_h_mask)

    prediction = transformer(tokenized_sent, output, enc_pad_mask, combined_mask)

    # Keep only the scores of the last position and take the best token.
    id_pred = tf.argmax(prediction[..., -1:, :], axis=-1, output_type=tf.int32)
    output = tf.concat([output, id_pred], axis=-1)

    # Stop once the model closes the sentence instead of decoding past it.
    if int(id_pred[0, 0]) == end_id:
      break

  return french_token.sequences_to_texts(output.numpy())

In [None]:
# Run the greedy decoder on a sample English sentence.
translate("I am pretty")

['<start> je suis jolie . <end> <end> <end> . <end> <end> <end> <end> <end> <end> <end>']