# Image Captioning with a Transformer-based Encoder-Decoder Model (ViT-BERT) #

This involves two elements:

1. Vision Transformer (ViT): a Transformer-based architecture that uses self-attention mechanisms to process images. It is used here to extract image features.
2. Bidirectional Encoder Representations from Transformers (BERT): a Large Language Model (LLM) for text data.

References on how to train a transformer model from scratch: </br>
i)   https://sushantjha8.medium.com/lets-train-image-to-text-transformer-846150b632ef </br>
ii)  https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97 </br>
iii) https://ankur3107.github.io/blogs/the-illustrated-image-captioning-using-transformers/ </br>


In [None]:
#!pip install rouge_score

In [None]:
import src.utils as plh

PROJECT_ROOT = plh.get_project_root()

# Pre-Processor steps for the image-text data #

Define the objects used to pre-process the inputs.

## Pre-Process: Image using ViTImageProcessor ##

In [None]:
from transformers import ViTImageProcessor #pre-processes the input image before it is given to the VisionEncoderDecoderModel

image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") #Image processor as proposed in the original paper by Google

## Pre-Process: Text using BertTokenizer ##

In [None]:
from transformers import BertTokenizer #turns captions into token ids and decodes generated token ids back into strings

# VisionEncoderDecoderModel requires eos_token_id, decoder_start_token_id and pad_token_id to be defined in the model config.
# When these were missing from model.config, training failed with the error "Make sure to set the decoder_start_token_id attribute of the model's configuration."

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
#PAD Token
(tokenizer.pad_token, tokenizer.pad_token_id)

In [None]:
#CLS Token
(tokenizer.cls_token, tokenizer.cls_token_id)

In [None]:
#SEP Token
(tokenizer.sep_token, tokenizer.sep_token_id)

In [None]:

# The "bert-base-uncased" tokenizer does not provide bos_token_id / eos_token_id by default.
# BERT's [CLS] token is reused as the beginning-of-sentence (BOS) token and
# its [SEP] token as the end-of-sentence (EOS) token.
# Idea taken from: https://huggingface.co/transformers/v3.3.1/model_doc/bertgeneration.html#bertgenerationtokenizer
# Passing bos_token / bos_token_id to from_pretrained did not help.

# BOS <- [CLS]
tokenizer.bos_token = tokenizer.cls_token
tokenizer.bos_token_id = tokenizer.cls_token_id

# EOS <- [SEP]
tokenizer.eos_token = tokenizer.sep_token
tokenizer.eos_token_id = tokenizer.sep_token_id

# Show the resulting token / id pairs.
print(f'BOS:{tokenizer.bos_token}', tokenizer.bos_token_id)
print(f'EOS:{tokenizer.eos_token}', tokenizer.eos_token_id)

In [None]:
tokenizer

# Model #

Define the config of the model

In [None]:
from transformers import BertConfig

config_decoder = BertConfig() #Default BERT architecture/configuration, used for the decoder side
config_decoder

In [None]:

from transformers import ViTConfig

config_encoder = ViTConfig() #Default ViT architecture/configuration, used for the encoder side
config_encoder

In [None]:
from transformers import VisionEncoderDecoderConfig

config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) #Combine the ViT (encoder) and BERT (decoder) configurations into one ViT-BERT config
config

In [None]:
#For TensorFlow
#from transformers import TFVisionEncoderDecoderModel #TensorFlow-based VisionEncoderDecoderModel; generates text from an image
#model = TFVisionEncoderDecoderModel(config = config) #Build the model using the config

#For PyTorch
from transformers import VisionEncoderDecoderModel
model = VisionEncoderDecoderModel(config = config) #Build the model from the config only (no from_pretrained checkpoint is used here)

### Update the model config with the token ###

In [None]:
(tokenizer.pad_token , tokenizer.eos_token)

In [None]:
(tokenizer.pad_token_id , tokenizer.eos_token_id)

In [None]:
#Copy the tokenizer's special-token ids into the top-level model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id #BOS id is used as the decoder start token
model.config.pad_token_id = tokenizer.pad_token_id

#Disabled alternative: set the same ids on the decoder sub-config
#model.config.decoder.eos_token_id = tokenizer.eos_token_id
#model.config.decoder.decoder_start_token_id = tokenizer.bos_token_id
#model.config.decoder.pad_token_id = tokenizer.pad_token_id

#Disabled alternative: set the ids on the encoder sub-config
#model.config.encoder.eos_token_id = tokenizer.eos_token_id
#model.config.encoder.decoder_start_token_id = tokenizer.bos_token_id

In [None]:
#https://discuss.huggingface.co/t/error-training-vision-encoder-decoder-for-image-captioning/12090/6

#model.resize_token_embeddings(len(tokenizer))

# Load data #

Load the data from the parquet files into a dataset object and define a class that applies the pre-processing to the data.

HuggingFace dataset: https://huggingface.co/docs/datasets/loading#parquet

In [None]:
from datasets import load_dataset
import os


# Locations of the processed train / validation parquet files.
base_path = os.path.join(PROJECT_ROOT, 'data', 'processed')
train_data = 'train_data_processed.parquet'
valid_data = 'validate_data_processed.parquet'

# One entry per split: split name -> full path of its parquet file.
data_files = {
    split_name: os.path.join(base_path, file_name)
    for split_name, file_name in (("train", train_data), ('valid', valid_data))
}

# Load both splits into a Hugging Face dataset, drop the columns that are not
# used afterwards, and replace the bare image file name with its full path
# under <PROJECT_ROOT>/data/images.
db_set = (
    load_dataset("parquet", data_files = data_files)
    .remove_columns(['id', 'title', 'color', 'clean_title', 'clean_color', '__index_level_0__'])
    .map(
        lambda dbrow: { "image_path": os.path.join(PROJECT_ROOT, 'data', 'images', dbrow["image_name"]) },
        remove_columns = ["image_name"],
    )
)

db_set


In [None]:
db_set['train'].num_rows

In [None]:
db_set['train'][0]

In [None]:
#look at top 5 records in the train dataset
db_set['train']["image_path"][:5]

In [None]:
#Apply the pre-process on image and text on the dataset
#import tensorflow as tf
from torch.utils.data import Dataset
from PIL import Image

class captiondatset(Dataset):
    """Torch dataset that turns (image path, caption) rows into model inputs.

    Every item is a dict with two entries:

    * ``pixel_values``: the image at the row's ``image_path`` run through the
      module-level ``image_processor``.
    * ``labels``: the row's ``caption`` tokenized with the module-level
      ``tokenizer``, padded / truncated to ``max_length`` token ids.
    """

    def __init__(self, datasets, length = None, max_length = 15):
        """
        Args:
            datasets: row-indexable collection (``datasets[idx]`` returns one
                row) whose rows expose the ``image_path`` and ``caption`` keys.
            length: number of rows to expose through ``len()``. Defaults to
                ``len(datasets)`` when omitted.
            max_length: number of token ids every caption is padded or
                truncated to.
        """
        self.datasets = datasets
        self.length = len(datasets) if length is None else length
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows this dataset exposes."""
        return self.length

    def __getitem__(self, idx):
        """Return the pre-processed image and tokenized caption of row ``idx``."""
        # Fetch the single row first. Indexing the column first
        # (datasets['image_path'][idx]) asks for the whole column on every access.
        row = self.datasets[idx]

        # The context manager closes the file handle once the pixels are read.
        # convert('RGB') hands the processor a 3-channel image whatever mode
        # the file on disk uses.
        with Image.open(row['image_path']) as image:
            image_features = image_processor(image.convert('RGB'), return_tensors = "pt").pixel_values #tf

        # Only input_ids is used, so token type ids are not requested.
        # NOTE(review): padded positions keep pad_token_id in the labels, while
        # compute_metrics expects -100 as the ignore marker - confirm whether
        # padding should be masked with -100 before computing the loss.
        labels = tokenizer(row["caption"],
                           return_tensors = "pt", #tf
                           max_length = self.max_length,
                           padding = 'max_length',
                           truncation = True).input_ids

        # Drop the leading batch dimension added by return_tensors = "pt".
        #return {'pixel_values': tf.squeeze(image_features), 'labels': tf.squeeze(labels)}
        return {'pixel_values': image_features.squeeze(0), 'labels': labels.squeeze(0)}

In [None]:
datsets_train = captiondatset(db_set['train'], db_set['train'].num_rows)

In [None]:
#Check the transformed output at row idx 0
datsets_train.__getitem__(0)

In [None]:
datsets_train.__getitem__(16)['labels']

In [None]:
dataset_val = captiondatset(db_set['valid'], db_set['valid'].num_rows) 

In [None]:
import gc

# Drop the notebook-level name and run a collection pass.
# NOTE(review): the captiondatset objects built above were given
# db_set['train'] / db_set['valid'] and keep them in self.datasets -
# confirm how much memory this actually releases.
del db_set
gc.collect()

# Evaluate #

Define evaluate metric for the model.

In [None]:
from rouge_score import rouge_scorer, scoring

# ROUGE variants that are scored for every prediction / reference pair.
rouge_types = ["rouge1", "rouge2", "rougeL"]
use_stemmer = False

# Scorer used by compute_metrics for each (prediction, reference) pair.
rouge_score_obj = rouge_scorer.RougeScorer(rouge_types = rouge_types, use_stemmer = use_stemmer)
# Module-level aggregator: it keeps every score added to it for as long as this object lives.
aggregator = scoring.BootstrapAggregator()

def compute_metrics(pred):
    """Compute aggregated ROUGE F-measures for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` and ``label_ids``.
            ``predictions`` may hold token ids, logits with a trailing
            vocabulary axis, or a tuple whose first element is one of those.
            ``label_ids`` holds the reference token ids, with -100 marking
            positions to ignore.

    Returns:
        dict with the keys ``rouge1_fmeasure``, ``rouge2_fmeasure`` and
        ``rougeL_fmeasure``, each rounded to two decimals.
    """
    import numpy as np  # local import so the function does not rely on a notebook-level import

    predictions = pred.predictions
    # NOTE(review): when the model returns several outputs the first one is
    # assumed to carry the logits / token ids - confirm for this model.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pred_ids = np.asarray(predictions)
    # 3-D predictions are treated as logits (batch, sequence, vocab) and
    # reduced to the most likely token id per position; batch_decode needs ids.
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(axis = -1)

    # Work on a copy so the caller's label array is left untouched, then
    # swap the -100 ignore marker for the pad id so decoding can skip it.
    labels_ids = np.array(pred.label_ids)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    # skip_special_tokens removes padding and the other special tokens.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens = True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens = True)

    # A fresh aggregator per call: a shared one would keep the scores of
    # earlier evaluations and blend them into every later result.
    run_aggregator = scoring.BootstrapAggregator()
    for reference, prediction in zip(label_str, pred_str):
        run_aggregator.add_scores(
            rouge_score_obj.score(prediction = prediction, target = reference)
        )

    result = run_aggregator.aggregate()

    return {
        "rouge1_fmeasure": round(result['rouge1'].mid.fmeasure, 2),
        "rouge2_fmeasure": round(result['rouge2'].mid.fmeasure, 2),
        "rougeL_fmeasure": round(result['rougeL'].mid.fmeasure, 2),
    }


# Training #

### Training Arguments ###

Help Refer: https://huggingface.co/transformers/v3.5.1/main_classes/trainer.html#transformers.TrainingArguments

In [None]:
#Where model checkpoints will be saved.
output_dir = './data/vit_bert/'


# evaluation_strategy (default "no"): when the model is evaluated on the evaluation set.
# Possible values are:
# "no": No evaluation is done during training.
# "steps": Evaluation is done (and logged) every eval_steps.
# "epoch": Evaluation is done at the end of each epoch.
evaluation_strategy = 'steps'
eval_steps = 6


# logging_strategy (default: "steps"): The logging strategy to adopt during
# training (used to log training loss for example). The written training logs
# can be visualized with TensorBoard. Possible values are:
# "no": No logging is done during training.
# "epoch": Logging is done at the end of each epoch.
# "steps": Logging is done every logging_steps.
logging_strategy = "steps"
# logging_steps (default 500): Number of update steps between two logs if
# logging_strategy="steps".
logging_steps = 5


# Save the trained model.
# The checkpoint save strategy to adopt during training. Possible values are:
# "no": No save is done during training.
# "epoch": Save is done at the end of each epoch.
# "steps": Save is done every save_steps (default 500).
save_strategy = "epoch"
# save_steps (default: 500): Number of update steps between two checkpoint
# saves if save_strategy="steps".
#save_steps = 200


learning_rate = 2e-4


# per_device_train_batch_size: The batch size per GPU/TPU core/CPU for training.
per_device_train_batch_size = 32 #4
# Gradients are accumulated over this many batches before each optimizer update, so the
# effective batch size per device is per_device_train_batch_size * gradient_accumulation_steps
# (32 * 16 = 512 here; the earlier setting of 4 * 16 gave 64).
gradient_accumulation_steps = 16

# per_device_eval_batch_size: The batch size per GPU/TPU core/CPU for evaluation.
per_device_eval_batch_size = 32


# Mixed precision is enabled only when a CUDA device is present, so the
# notebook's CPU fallback is not configured with fp16.
import torch

fp16 = torch.cuda.is_available()


# Total number of training epochs to perform
num_train_epochs = 3


# load_best_model_at_end (default False): Whether or not to load the best model
# found during training at the end of training.
load_best_model_at_end = False


# NOTE: despite its name, this variable holds the compute_metrics callable that
# is handed to Trainer(compute_metrics = ...). It is not the TrainingArguments
# option "metric_for_best_model", which is the *name* of a metric returned by
# the evaluation (with or without the prefix "eval_") and is used together with
# load_best_model_at_end to compare two different models.
metric_for_best_model = compute_metrics

# report_to:
# The list of integrations to report the results and logs to. Supported
# platforms are "azure_ml", "comet_ml", "mlflow", "tensorboard" and "wandb".
# Use "all" to report to all integrations installed, "none" for no integrations.
# The string "none" is what switches reporting off; None is not the same thing.
report_to = "none" #"tensorboard".


from transformers import TrainingArguments

# Bundle the hyperparameters defined above into one TrainingArguments object.
training_args = TrainingArguments(
    # Output / checkpointing
    output_dir = output_dir,
    save_strategy = save_strategy,
    load_best_model_at_end = load_best_model_at_end,

    # Optimisation
    learning_rate = learning_rate,
    num_train_epochs = num_train_epochs,
    fp16 = fp16,

    # Batch sizes
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    per_device_eval_batch_size = per_device_eval_batch_size,

    # Evaluation schedule
    evaluation_strategy = evaluation_strategy,
    eval_steps = eval_steps,

    # Logging / reporting
    logging_strategy = logging_strategy,
    #logging_dir = './data/logs',
    logging_steps = logging_steps,
    report_to = report_to,
)

In [None]:
# Start TensorBoard before training to monitor it in progress

#%load_ext tensorboard
#%tensorboard --logdir output_dir


### Train ###

Help refer: https://huggingface.co/docs/transformers/training

In [None]:
#Training

#For tensorflow

#import tensorflow as tf
#devices = tf.config.experimental.list_physical_devices('GPU')

#For Pytorch
import torch

# Use the GPU when torch can see one, otherwise stay on the CPU.
devices = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the chosen device and switch it to training mode.
model = model.to(devices)
model.train()

In [None]:
from transformers import default_data_collator
from transformers import Trainer

#With TensorFlow, training would use the compile and fit methods of tf instead.

#With PyTorch, training goes through a Trainer object.
trainer = Trainer(
    model = model,                              # the instantiated Transformers model to be trained
    args = training_args,                       # training arguments, defined above
    data_collator = default_data_collator,      # stacks the dataset items into batches
    train_dataset = datsets_train,              # training dataset
    eval_dataset = dataset_val,                 # evaluation dataset
    compute_metrics = metric_for_best_model,    # holds the compute_metrics function
)

In [None]:
# NOTE(review): this runs after TrainingArguments and Trainer were created - confirm the variable is still read at this point.
os.environ['WANDB_DISABLED'] = 'True' #When training on Kaggle, the run connected to WANDB and asked for an API key

In [None]:
trainer.train()