# <center>Image captioning - Transformer model</center>

## Các tài liệu nghiên cứu thêm:
 - EfficientNetB2:
 - Swin Transformer: https://viblo.asia/p/paper-explain-swin-transformer-hierarchical-vision-transformer-using-shifted-windows-L4x5xqxmKBM
 

In [1]:
%pip install -r requirements.txt




In [2]:
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
from collections import defaultdict
from tqdm import tqdm
import time

%matplotlib inline

## 1. Data preparation

### 1.1. Load the cations

In [3]:
SEQ_START = '<start>'
SEQ_END = '<end>'

In [4]:
BASE = os.path.join('.','Flickr8k')
IMAGE_PATH = os.path.join(BASE, 'Flickr8k_Dataset')
CAPTION_PATH = os.path.join(BASE, 'Flickr8k_text')
CAPTION_FULL = os.path.join(CAPTION_PATH, 'Flickr8k.token.txt')

In [5]:
captions_map = defaultdict(list)
with open(CAPTION_FULL) as file:
    lines = file.readlines()
    for line in lines:
        data = line.split('\t')
        image_id = data[0].split('#')[0]
        caption = SEQ_START+ ' ' + data[1].strip()+' '+SEQ_END
        if not os.path.exists(os.path.join(IMAGE_PATH, image_id+'.npy')):
            continue
        captions_map[image_id].append(caption)

In [6]:
len(list(captions_map.keys()))

2477

In [7]:
all_captions = []
all_image_paths = []
for image_id in captions_map:
    all_captions.extend(captions_map[image_id])
    all_image_paths.extend([os.path.join(IMAGE_PATH, image_id)]* len(captions_map[image_id]))

In [8]:
all_captions[:10]

['<start> A child in a pink dress is climbing up a set of stairs in an entry way . <end>',
 '<start> A girl going into a wooden building . <end>',
 '<start> A little girl climbing into a wooden playhouse . <end>',
 '<start> A little girl climbing the stairs to her playhouse . <end>',
 '<start> A little girl in a pink dress going into a wooden cabin . <end>',
 '<start> A black dog and a spotted dog are fighting <end>',
 '<start> A black dog and a tri-colored dog playing with each other on the road . <end>',
 '<start> A black dog and a white dog with brown spots are staring at each other in the street . <end>',
 '<start> Two dogs of different breeds looking at each other on the road . <end>',
 '<start> Two dogs on pavement moving toward each other . <end>']

In [9]:
all_image_paths[:10]

['.\\Flickr8k\\Flickr8k_Dataset\\1000268201_693b08cb0e.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1000268201_693b08cb0e.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1000268201_693b08cb0e.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1000268201_693b08cb0e.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1000268201_693b08cb0e.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1001773457_577c3a7d70.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1001773457_577c3a7d70.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1001773457_577c3a7d70.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1001773457_577c3a7d70.jpg',
 '.\\Flickr8k\\Flickr8k_Dataset\\1001773457_577c3a7d70.jpg']

### 1.2. Tokenize captions

In [10]:
caption_dataset = tf.data.Dataset.from_tensor_slices(all_captions)

In [11]:
def standardize(input):
    input = tf.strings.lower(input)
    return tf.strings.regex_replace(input, r"!\"#$%&\(\)\*\+.,-/:;=?@\[\\\]^_`{|}~", "")

In [12]:
# Các tham số cho tokenizer(thuật toán tách từ)
MAX_LENGTH = 50
VOCAB_SIZE = 5000

# Tokenizer
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens = VOCAB_SIZE,
    standardize = standardize,
    output_sequence_length = MAX_LENGTH
)

# Learn the vocabulary from the caption data.
tokenizer.adapt(caption_dataset)

In [13]:
caption_vectors = caption_dataset.map(lambda x: tokenizer(x))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [14]:
for cap in caption_vectors.take(4):
    print(cap)

tf.Tensor(
[   3    2   44    6    2   91  157    9   99   49    2  361   14  385
    6   31 4553  509    5    4    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0], shape=(50,), dtype=int64)
tf.Tensor(
[  3   2  19 320  61   2 183 118   5   4   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(50,), dtype=int64)
tf.Tensor(
[   3    2   39   19   99   61    2  183 2513    5    4    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0], shape=(50,), dtype=int64)
tf.Tensor(
[   3    2   39   19   99    7  385   20   59 2513    5    4    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    

In [15]:
word_to_index = tf.keras.layers.StringLookup(
    mask_token = '',
    vocabulary = tokenizer.get_vocabulary()
)
index_to_word = tf.keras.layers.StringLookup(
    mask_token = '',
    vocabulary = tokenizer.get_vocabulary(),
    invert= True
)

### 1.3. Image feature extraction

In [16]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (260, 260))  # EfficientNetB2 expects this input shape
    img = tf.keras.applications.efficientnet.preprocess_input(img)
    return img, image_path

In [17]:
image_model = tf.keras.applications.EfficientNetB2(
    include_top = False,
    weights = 'imagenet'
)
new_input = image_model.input
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

In [18]:
unique_image_paths = sorted(set(all_image_paths))

In [19]:
image_dataset = tf.data.Dataset.from_tensor_slices(unique_image_paths)
image_dataset = image_dataset.map(load_image, num_parallel_calls=tf.data.AUTOTUNE).batch(16)
try:
    for image, path in tqdm(image_dataset):
        # batch_features shape == (16, 8, 8, 1408) (16 is batch size)
        batch_features = image_features_extract_model(image)
        # after reshaping, batch_features shape == (16, 64, 1408)
        batch_features = tf.reshape(batch_features,(batch_features.shape[0], -1, batch_features.shape[3]))

        # store the features to a numpy file
        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())
except:
    pass

100%|██████████| 155/155 [03:03<00:00,  1.18s/it]


### 1.4. Split the data into training, validation


In [20]:
image_to_caption_vectors = defaultdict(list)
for image_path, caption in zip(all_image_paths, caption_vectors):
    image_to_caption_vectors[image_path].append(caption)

In [21]:
#image_keys = list(image_to_caption_vectors.keys())
random.shuffle(unique_image_paths)
slice_index = int(len(unique_image_paths) * 0.8)
train_paths, val_paths = unique_image_paths[:slice_index], unique_image_paths[slice_index:]

In [22]:
image_train_paths =[]
caption_train = []
for path in train_paths:
    caption_len = len(image_to_caption_vectors[path])
    image_train_paths.extend([path] * caption_len)
    caption_train.extend(image_to_caption_vectors[path])

image_val_paths = []
caption_val = []
for path in val_paths:
    caption_len = len(image_to_caption_vectors[path])
    image_val_paths.extend([path] * caption_len)
    caption_val.extend(image_to_caption_vectors[path])

len(image_train_paths), len(caption_train), len(image_val_paths), len(caption_val)

(9905, 9905, 2480, 2480)

### 1.5. Create tf.data dataset for training

In [23]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000

In [24]:
def map_func(image_path, caption):
    img_tensor = np.load(image_path.decode('utf-8') + '.npy')
    return img_tensor, caption

In [25]:
dataset = tf.data.Dataset.from_tensor_slices((image_train_paths, caption_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
                            map_func, [item1, item2], [tf.float32, tf.int64]
                      )
                      , num_parallel_calls=tf.data.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

## 2. Training model

### 2.1. Transfromer model

In [26]:
num_layer = 4
emb_dim = 512
fc_dim = 2048
num_heads = 8
row_size = 8
col_size = 8
target_vocab_size = VOCAB_SIZE
dropout_rate = 0.1

In [27]:
from transformer_model import Transformer
transformer = Transformer(num_layers=num_layer,
                          emb_dim=emb_dim,
                          num_heads=num_heads,
                          fc_dim=fc_dim,
                          row_size=row_size,
                          col_size=col_size,
                          target_vocab_size=VOCAB_SIZE,
                          dropout_rate=dropout_rate)

### 2.2. Custom Learning Rate Schedule

In [28]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [29]:
learning_rate = CustomSchedule(512.0)

In [30]:
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

InvalidArgumentError: Value for attr 'T' of int64 is not in the list of allowed values: bfloat16, half, float, double, complex64, complex128
	; NodeDef: {{node Rsqrt}}; Op<name=Rsqrt; signature=x:T -> y:T; attr=T:type,allowed=[DT_BFLOAT16, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128]> [Op:Rsqrt]

### 2.3.Lost function

In [31]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [32]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

### 2.4. Checkpoint

In [33]:
checkpoint_path = './checkpoints/train'

ckpt = tf.train.Checkpoint(transformer=transformer,optimizer=optimizer)
#ckpt = tf.train.Checkpoint(transformer=transformer)


ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

NameError: name 'optimizer' is not defined

### 2.5. Training step

In [34]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
loss_plot = []

In [35]:
from mask import create_look_ahead_mask
@tf.function()
def train_step(input_tensor, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar_inp)[1])
    dec_padding_mask = None

    with tf.GradientTape() as tape:
        predictions, _ = transformer(input_tensor, tar_inp, True, None, look_ahead_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    
    train_loss(loss)

In [36]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    start = time.time()

    train_loss.reset_states()

    for (batch, (img_tensor, tar)) in enumerate(dataset):
        train_step(img_tensor, tar)

        if batch % 50 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, train_loss.result()))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    loss_plot.append(train_loss.result())

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

    print('------------------------------------')

NameError: in user code:

    File "C:\Users\Admin\AppData\Local\Temp\ipykernel_3136\1942399665.py", line 15, in train_step  *
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    NameError: name 'optimizer' is not defined


In [None]:
def generate(image_path):
    if 'https://' in image_path:
        image_extension = image_path[-4:]
        image_path = tf.keras.utils.get_file('image' + image_extension, origin=image_path)

    result, _ = evaluate(image_path)
    print('Prediction Caption:', ' '.join(result))
    # opening the image
    plt.imshow(plt.imread(image_path))
    plt.show()