In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,LayerNormalization,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input,MultiHeadAttention,Embedding,TextVectorization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

#Data Preparation

##Data Download

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2023-11-23 23:53:50--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7757635 (7.4M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-11-23 23:53:50 (19.0 MB/s) - ‘fra-eng.zip’ saved [7757635/7757635]



In [3]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


## Data Processing

In [4]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

In [5]:
VOCAB_SIZE=20000
ENGLISH_SEQUENCE_LENGTH=32
FRENCH_SEQUENCE_LENGTH=32
EMBEDDING_DIM=256
BATCH_SIZE=128

In [6]:
english_vectorize_layer=TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

In [7]:
french_vectorize_layer=TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [8]:
def selector(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return {'input_1':split_text[0:1],'input_2':'starttoken '+split_text[1:2]},split_text[1:2]+' endtoken'

In [9]:
split_dataset=text_dataset.map(selector)

In [10]:
def separator(input_text):
  split_text=tf.strings.split(input_text,'\t')
  return split_text[0:1],'starttoken '+split_text[1:2]+' endtoken'

In [11]:
init_dataset=text_dataset.map(separator)

In [12]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [13]:
english_training_data=init_dataset.map(lambda x,y:x)### input x,y and output x
english_vectorize_layer.adapt(english_training_data)#### adapt the vectorize_layer to the training data

In [14]:
french_training_data=init_dataset.map(lambda x,y:y)### input x,y and output y
french_vectorize_layer.adapt(french_training_data)#### adapt the vectorize_layer to the training data

In [15]:
def vectorizer(inputs,output):
  return {'input_1':english_vectorize_layer(inputs['input_1']),
          'input_2':french_vectorize_layer(inputs['input_2'])},french_vectorize_layer(output)

In [16]:
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [17]:
dataset=split_dataset.map(vectorizer)

In [18]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [19]:
for i in dataset.take(1):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[44,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[  2, 103,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[103,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]])>)


In [20]:
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, 32), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 32), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 32), dtype=tf.int64, name=None))>

In [21]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [22]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 32), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 32), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 32), dtype=tf.int64, name=None))>

In [23]:
NUM_BATCHES=int(200000/BATCH_SIZE)

In [24]:
train_dataset=dataset.take(int(0.9*NUM_BATCHES))
val_dataset=dataset.skip(int(0.9*NUM_BATCHES))

In [25]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 32), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 32), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 32), dtype=tf.int64, name=None))>

In [26]:
#score=tf.einsum('ijk,ibk->ijb',query,key)

# Modeling

## Embedding

In [27]:
def positional_encoding(model_size,SEQUENCE_LENGTH):
  output=[]
  for pos in range(SEQUENCE_LENGTH):
    PE=np.zeros((model_size))
    for i in range(model_size):
      if i%2==0:
        PE[i]=np.sin(pos/(10000**(i/model_size)))
      else:
        PE[i]=np.cos(pos/(10000**((i-1)/model_size)))
    output.append(tf.expand_dims(PE,axis=0))
  out=tf.concat(output,axis=0)
  out=tf.expand_dims(out,axis=0)
  return tf.cast(out,dtype=tf.float32)

In [28]:
print(positional_encoding(256,64).shape)

(1, 64, 256)


In [44]:
class Embeddings(Layer):
  def __init__(self, sequence_length, vocab_size, embed_dim,):
    super(Embeddings, self).__init__()
    self.token_embeddings=Embedding(
        input_dim=vocab_size, output_dim=embed_dim)
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim

  def call(self, inputs):
    embedded_tokens = self.token_embeddings(inputs)
    embedded_positions=positional_encoding(
        self.embed_dim,self.sequence_length)
    return embedded_tokens + embedded_positions

  def compute_mask(self, inputs, mask=None):
    return tf.math.not_equal(inputs, 0)

In [45]:
test_input=tf.constant([[2,112,10,12,5,0,0,0,]])
emb=Embeddings(8,20000,256)
emb_out=emb(test_input)
print(emb_out.shape)

(1, 8, 256)


In [47]:
mask = emb.compute_mask(test_input)
mask1 = mask[:, :, tf.newaxis]
mask2 = mask[:, tf.newaxis, :]
padding_mask = tf.cast(mask1&mask2, dtype="int32")
print(padding_mask)

tf.Tensor(
[[[1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]], shape=(1, 8, 8), dtype=int32)


## Custom MultiHeadAttention

In [48]:
class CustomSelfAttention(Layer):
  def __init__(self, model_size):
    super(CustomSelfAttention, self).__init__()
    self.model_size=model_size
  def call(self, query, key, value, masking):
    ########### Compute scores
    score = tf.matmul(query, key, transpose_b=True)
    ########### scaling
    score/=tf.math.sqrt(tf.cast(self.model_size, tf.float32))
    ########### masking
    masking=tf.cast(masking, dtype=tf.float32)
    score+=(1.-masking)*-1e10
    ########### atttention_weights
    attention=tf.nn.softmax(score, axis=-1)*masking
    ########### output
    head=tf.matmul(attention, value)
    return head

In [49]:
attention=CustomSelfAttention(256)
attention(tf.ones([1,8,256]), tf.ones([1,8,256]), tf.ones([1,8,256]), padding_mask)

<tf.Tensor: shape=(1, 8, 256), dtype=float32, numpy=
array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)>

In [55]:
class CustomMultiHeadAttention(Layer):
  def __init__(self, num_heads, key_dim):
    super(CustomMultiHeadAttention, self).__init__()

    self.num_heads=num_heads
    self.dense_q=[Dense(key_dim) for _ in range(num_heads)]
    self.dense_k=[Dense(key_dim) for _ in range(num_heads)]
    self.dense_v=[Dense(key_dim) for _ in range(num_heads)]
    self.dense_o=Dense(key_dim)
    self.self_attention=CustomSelfAttention(key_dim)

  def call(self, query, key, value, attention_mask):
    heads=[]

    for i in range(self.num_heads):
      head=self.self_attention(self.dense_q[i](query), self.dense_k[i](key),
                               self.dense_v[i](value), attention_mask)
      heads.append(head)
    heads=tf.concat(heads, axis=2)
    heads=self.dense_o(heads)
    return heads

## Encoder

In [56]:
class TransformerEncoder(Layer):
  def __init__(self, embed_dim, dense_dim, num_heads,):
    super(TransformerEncoder, self).__init__()
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = CustomMultiHeadAttention(
        num_heads = num_heads, key_dim=embed_dim,
    )
    self.dense_proj=tf.keras.Sequential(
        [Dense(dense_dim, activation="relu"),
         Dense(embed_dim),]
    )
    self.layernorm_1 = LayerNormalization()
    self.layernorm_2 = LayerNormalization()
    self.supports_masking = True

  def call(self, inputs, mask=None):
    print(mask)
    if mask is not None:
      mask = tf.cast(
          mask[:, tf.newaxis, :], dtype="int32")
      T=tf.shape(mask)[2]
      padding_mask = tf.repeat(mask, T, axis=1)
      print(padding_mask)
    attention_output = self.attention(
        query=inputs, key=inputs, value=inputs,
        attention_mask=padding_mask
    )

    proj_input = self.layernorm_1(inputs + attention_output)
    proj_output = self.dense_proj(proj_input)
    return self.layernorm_2(proj_input + proj_output)



In [58]:
encoder_outputs = TransformerEncoder(256, 2048, 2)(emb_out)
print(encoder_outputs.shape)

tf.Tensor([[ True  True  True  True  True False False False]], shape=(1, 8), dtype=bool)
tf.Tensor(
[[[1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]]], shape=(1, 8, 8), dtype=int32)
(1, 8, 256)


## Decoder

In [59]:
print(tf.linalg.band_part(
    tf.ones([1, 8, 8], dtype=tf.int32), -1,0
))

tf.Tensor(
[[[1 0 0 0 0 0 0 0]
  [1 1 0 0 0 0 0 0]
  [1 1 1 0 0 0 0 0]
  [1 1 1 1 0 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 1 0]
  [1 1 1 1 1 1 1 1]]], shape=(1, 8, 8), dtype=int32)


In [60]:
class TransformerDecoder(Layer):
  def __init__(self, embed_dim, latent_dim, num_heads):
    super(TransformerDecoder, self).__init__()
    self.embed_dim = embed_dim
    self.latent_dim = latent_dim
    self.num_heads = num_heads
    self.attention_1 = CustomMultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.attention_2 = CustomMultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.dense_proj = tf.keras.Sequential(
        [Dense(latent_dim, activation="relu"), Dense(embed_dim),]
    )
    self.layernorm_1=LayerNormalization()
    self.layernorm_2=LayerNormalization()
    self.layernorm_3=LayerNormalization()
    self.supports_masking = True
  def call(self, inputs, encoder_outputs, enc_mask, mask=None):

    if mask is not None:
      causal_mask=tf.linalg.band_part(
          tf.ones([tf.shape(inputs)[0],
                   tf.shape(inputs)[1],
                   tf.shape(inputs)[1]], dtype=tf.int32), -1, 0)
      mask = tf.cast(
          mask[:, tf.newaxis, :], dtype="int32")
      enc_mask = tf.cast(
          enc_mask[:, tf.newaxis, :], dtype="int32")

      T = tf.shape(mask)[2]
      padding_mask = tf.repeat(mask, T, axis=1)
      cross_attn_mask = tf.repeat(enc_mask, T, axis=1)
      combined_mask = tf.minimum(padding_mask, causal_mask)
      # print('padding', padding_mask)
      # print('causal', causal_mask)
      # print('crossattnmask', cross_attn_mask)
      # print('combinedmask', combined_mask)
    attention_output_1 = self.attention_1(
        query=inputs, key=inputs, value=inputs,
        attention_mask=combined_mask,
    )

    out_1 = self.layernorm_1(inputs + attention_output_1)

    attention_output_2, scores= self.attention_2(
        query=out_1,key=encoder_outputs,value=encoder_outputs,
        attention_mask=cross_attn_mask,
        return_attention_scores=True

    )
    out_2 = self.layernorm_2(out_1 + attention_output_2)

    proj_output = self.dense_proj(out_2)
    return self.layernorm_3(out_2 + proj_output), scores

In [62]:
enc_mask=mask
decoder_outputs, scores = TransformerDecoder(256,2048,4)(emb_out,encoder_outputs,enc_mask)
print(decoder_outputs.shape)

padding tf.Tensor(
[[[1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]]], shape=(1, 8, 8), dtype=int32)
causal tf.Tensor(
[[[1 0 0 0 0 0 0 0]
  [1 1 0 0 0 0 0 0]
  [1 1 1 0 0 0 0 0]
  [1 1 1 1 0 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 1 0]
  [1 1 1 1 1 1 1 1]]], shape=(1, 8, 8), dtype=int32)
crossattnmask tf.Tensor(
[[[1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]]], shape=(1, 8, 8), dtype=int32)
combinedmask tf.Tensor(
[[[1 0 0 0 0 0 0 0]
  [1 1 0 0 0 0 0 0]
  [1 1 1 0 0 0 0 0]
  [1 1 1 1 0 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 0 0 0]]], shape=(1, 8, 8), dtype=int32)
(1, 8, 256)


## Transformer Model

In [67]:
EMBEDDING_DIM=128
D_FF=1024
NUM_HEADS=8
NUM_LAYERS=1
NUM_EPOCHS=20

In [68]:
encoder_inputs=Input(shape=(None,), dtype="int64", name="input_1")
emb = Embeddings(ENGLISH_SEQUENCE_LENGTH,VOCAB_SIZE,EMBEDDING_DIM)
x = emb(encoder_inputs)
enc_mask = emb.compute_mask(encoder_inputs)

for _ in range(NUM_LAYERS):
  x=TransformerEncoder(EMBEDDING_DIM,D_FF,NUM_HEADS)(x)
encoder_outputs=x

decoder_inputs=Input(shape=(None,), dtype="int64", name="input_2")

x = Embeddings(FRENCH_SEQUENCE_LENGTH,VOCAB_SIZE,EMBEDDING_DIM)(decoder_inputs)
for i in range(NUM_LAYERS):
  x=TransformerDecoder(EMBEDDING_DIM,D_FF,NUM_HEADS)(x, encoder_outputs,enc_mask)
x=tf.keras.layers.Dropout(0.5)(x)
decoder_outputs=Dense(VOCAB_SIZE, activation="softmax")(x)

attention_score_model = tf.keras.Model(
    [encoder_inputs, decoder_inputs], attention_scores, name="transformer"
)

transformer = tf.keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)
transformer.summary()

Tensor("Placeholder_1:0", shape=(None, None), dtype=bool)
Tensor("transformer_encoder_12/Repeat/Reshape_1:0", shape=(None, None, None), dtype=int32)
padding Tensor("transformer_decoder_9/Repeat/Reshape_1:0", shape=(None, None, None), dtype=int32)
causal Tensor("transformer_decoder_9/MatrixBandPart:0", shape=(None, 32, 32), dtype=int32)
crossattnmask Tensor("transformer_decoder_9/Repeat_1/Reshape_1:0", shape=(None, None, None), dtype=int32)
combinedmask Tensor("transformer_decoder_9/Minimum:0", shape=(None, 32, 32), dtype=int32)
Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]      

## Training

In [69]:
class BLEU(tf.keras.metrics.Metric):
    def __init__(self,name='bleu_score'):
        super(BLEU,self).__init__()
        self.bleu_score=0

    def update_state(self,y_true,y_pred,sample_weight=None):
      y_pred=tf.argmax(y_pred,-1)
      self.bleu_score=0
      for i,j in zip(y_pred,y_true):
        tf.autograph.experimental.set_loop_options()

        total_words=tf.math.count_nonzero(i)
        total_matches=0
        for word in i:
          if word==0:
            break
          for q in range(len(j)):
            if j[q]==0:
              break
            if word==j[q]:
              total_matches+=1
              j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
              break

        self.bleu_score+=total_matches/total_words

    def result(self):
        return self.bleu_score/BATCH_SIZE

In [70]:
class Scheduler(LearningRateSchedule):
  def __init__(self, d_model, warmup_steps):
    super(Scheduler, self).__init__()
    self.d_model = tf.cast(d_model, tf.float64)
    self.warmup_steps = tf.cast(warmup_steps, dtype=tf.float64)

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float64)
    return (self.d_model**(-0.5))*tf.math.minimum(step**(-0.5), step * (self.warmup_steps ** -1.5))

In [71]:
WARM_UP_STEPS = 4000
lr_scheduled = Scheduler(EMBEDDING_DIM, WARM_UP_STEPS)

In [72]:
transformer.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = Adam(lr_scheduled, beta_1=0.9, beta_2=0.98, epsilon=1e-9),)
    #metrics=[BLEU()],
    #run_eagerly=True)

In [73]:
history=transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10)

Epoch 1/10
Tensor("transformer/embeddings_8/NotEqual:0", shape=(None, 32), dtype=bool)
Tensor("transformer/transformer_encoder_12/Repeat/Reshape_1:0", shape=(None, 32, 32), dtype=int32)
padding Tensor("transformer/transformer_decoder_9/Repeat/Reshape_1:0", shape=(None, 32, 32), dtype=int32)
causal Tensor("transformer/transformer_decoder_9/MatrixBandPart:0", shape=(None, 32, 32), dtype=int32)
crossattnmask Tensor("transformer/transformer_decoder_9/Repeat_1/Reshape_1:0", shape=(None, 32, 32), dtype=int32)
combinedmask Tensor("transformer/transformer_decoder_9/Minimum:0", shape=(None, 32, 32), dtype=int32)
Tensor("transformer/embeddings_8/NotEqual:0", shape=(None, 32), dtype=bool)
Tensor("transformer/transformer_encoder_12/Repeat/Reshape_1:0", shape=(None, 32, 32), dtype=int32)
padding Tensor("transformer/transformer_decoder_9/Repeat/Reshape_1:0", shape=(None, 32, 32), dtype=int32)
causal Tensor("transformer/transformer_decoder_9/MatrixBandPart:0", shape=(None, 32, 32), dtype=int32)
cross

In [None]:
transformer.save_weights('/content/drive/MyDrive/transformers.h5')

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
transformer.evaluate(val_dataset)

#Testing

In [None]:
index_to_word={x:y for x, y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

In [None]:
def translator(english_sentence):
  tokenized_english_sentence=english_vectorize_layer([english_sentence])
  shifted_target='starttoken'

  for i in range(FRENCH_SEQUENCE_LENGTH):
    tokenized_shifted_target=french_vectorize_layer([shifted_target])
    output=transformer.predict([tokenized_english_sentence,tokenized_shifted_target])
    french_word_index=tf.argmax(output,axis=-1)[0][i].numpy()
    current_word=index_to_word[french_word_index]
    if current_word=='endtoken':
      break
    shifted_target+=' '+current_word
  return shifted_target[11:]

In [None]:
translator('What makes you think that it is not true?')

# Visualization

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def visualize(english_sentence):
  tokenized_english_sentence=english_vectorize_layer([english_sentence])
  shifted_target='starttoken je lai fait très bien'

  tokenized_shifted_target=french_vectorize_layer([shifted_target])
  attention_weights=attention_score_model.predict([tokenized_english_sentence,
                                                   tokenized_shifted_target])

  return attention_weights

out=visualize('I did it very well')


In [None]:
print(out['decoder_layer1_block2'][0].shape)

In [None]:
plt.figure(figsize = (12,12))

for i in range(NUM_HEADS):
  ax = plt.subplot(2,4, i+1)

  plt.imshow(out['decoder_layer1_block2'][0][i][0:10,0:10])
  plt.title("Attention Scores for head:->"+str(i+1))