# Deep Learning Term Project Part-B

## Installing Dependencies

In [2]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
!pip install pycocoevalcap

Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m225.3/297.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

#### Required Libraries

In [3]:
import os
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset
from transformers import ViTImageProcessor, BertTokenizer

### Importing Data

In [4]:
# Mount Google Drive so the dataset archive stored there becomes readable
# from this Colab session.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [5]:
!unzip '/content/drive/MyDrive/VIT/custom_captions_dataset.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: custom_captions_dataset/train/train_3851.jpg  
  inflating: custom_captions_dataset/train/train_3407.jpg  
  inflating: custom_captions_dataset/train/train_871.jpg  
  inflating: custom_captions_dataset/train/train_3642.jpg  
  inflating: custom_captions_dataset/train/train_347.jpg  
  inflating: custom_captions_dataset/train/train_2152.jpg  
  inflating: custom_captions_dataset/train/train_4825.jpg  
  inflating: custom_captions_dataset/train/train_1604.jpg  
  inflating: custom_captions_dataset/train/train_638.jpg  
  inflating: custom_captions_dataset/train/train_1818.jpg  
  inflating: custom_captions_dataset/train/train_4788.jpg  
  inflating: custom_captions_dataset/train/train_1734.jpg  
  inflating: custom_captions_dataset/train/train_918.jpg  
  inflating: custom_captions_dataset/train/train_2258.jpg  
  inflating: custom_captions_dataset/train/train_3745.jpg  
  inflating: custom_captions_dataset/tr

### Creating dataset

In [6]:
from torch.utils.data import Dataset

# Defining class for Image Captioning Dataset

class ImageCaptionDataset(Dataset):
    """Image/caption pairs described by a CSV index and an image directory.

    Rows are accessed positionally: the second CSV column (``iloc[idx, 1]``)
    is used as the image file name and the third column (``iloc[idx, 2]``) as
    the caption text.

    Args:
        csv_file: Path of the CSV file listing file names and captions.
        img_dir: Directory containing the image files named in the CSV.
        transform: Optional callable applied to the RGB PIL image. When it is
            falsy, the ViT image processor produces the pixel tensor instead.
        max_length: Captions are padded/truncated to exactly this many tokens.
        ignore_pad_in_loss: When ``True``, label positions whose attention
            mask is 0 (padding) are replaced by ``-100``, the default
            ``ignore_index`` of PyTorch's cross-entropy loss. The default
            ``False`` keeps the raw padded token ids as labels.
            NOTE(review): when enabling this, also make sure generation is
            configured with an end-of-sequence token, because the decoder is
            then no longer trained on what follows the final caption token.
    """

    def __init__(self, csv_file, img_dir, transform=None, max_length=128,
                 ignore_pad_in_loss=False):
        self.img_labels = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
        self.max_length = max_length
        self.ignore_pad_in_loss = ignore_pad_in_loss

    def __len__(self):
        """Return the number of rows (image/caption pairs) in the CSV."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            dict with ``'pixel_values'`` (the transformed/processed image) and
            ``'labels'`` (caption token ids of length ``max_length``).
        """
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 1])
        caption = self.img_labels.iloc[idx, 2]

        # `convert` returns a new in-memory image, so the underlying file can
        # be closed right away instead of staying open until garbage
        # collection.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        if self.transform:
            image = self.transform(image)
        else:
            # The processor returns a batch of one; drop the batch dimension.
            image = self.image_processor(images=image, return_tensors="pt").pixel_values.squeeze(0)

        tokens = self.tokenizer(caption, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        labels = tokens.input_ids.squeeze(0)

        if self.ignore_pad_in_loss:
            # Padding positions carry attention mask 0; -100 excludes them
            # from the loss.
            labels = labels.masked_fill(tokens.attention_mask.squeeze(0) == 0, -100)

        return {
            'pixel_values': image,
            'labels': labels,
        }


In [7]:
# Build the training and validation splits, each from its CSV index and the
# matching image folder.

train_dataset = ImageCaptionDataset(
    csv_file='/content/custom_captions_dataset/train.csv',
    img_dir='/content/custom_captions_dataset/train/',
)
val_dataset = ImageCaptionDataset(
    csv_file='/content/custom_captions_dataset/val.csv',
    img_dir='/content/custom_captions_dataset/val/',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

## Defining Model

In [8]:
# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU.

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [9]:
from transformers import ViTImageProcessor, BertTokenizer, VisionEncoderDecoderModel
from datasets import load_dataset

# Pre-trained ViT encoder + BERT decoder, combined into one captioning model.
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained("google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased")

# Special-token wiring for training: [CLS] starts decoding, [PAD] pads.
# [SEP] is additionally declared as the end-of-sequence token; without it
# `generate` has no stop condition and always decodes up to `max_length`.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id

# Mirror the same ids onto the generation config so `generate` sees them
# regardless of whether it still falls back to `model.config`.
model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.eos_token_id = tokenizer.sep_token_id

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bi

## Training and Validation

In [10]:
# Specifying training arguments for the model

from transformers import TrainingArguments

# Options handed to the Hugging Face Trainer, grouped by purpose.
training_options = {
    # where checkpoints and logs are written
    'output_dir': './results',
    'logging_dir': './logs',
    # evaluate every 50 optimisation steps, log every 10
    'evaluation_strategy': "steps",
    'eval_steps': 50,
    'logging_steps': 10,
    # optimisation schedule
    'num_train_epochs': 5,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 32,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    # after training, reload the checkpoint selected by the 'loss' metric
    'load_best_model_at_end': True,
    'metric_for_best_model': 'loss',
}

training_args = TrainingArguments(**training_options)

In [11]:
from transformers import Trainer

# Wire the model, the training options and both dataset splits into a Trainer.
# No `compute_metrics` callback is supplied.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'compute_metrics': None,
}
trainer = Trainer(**trainer_kwargs)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
# Run the fine-tuning loop; the value returned by `train()` is displayed as
# the cell result.

train_output = trainer.train()
train_output

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss
50,7.0843,6.2539
100,4.1513,3.718395
150,2.784,2.435906
200,1.9002,1.820117
250,1.8431,1.665628
300,1.6655,1.590443
350,1.5317,1.546921
400,1.4809,1.524433
450,1.4557,1.503455
500,1.5605,1.483548


There were missing keys in the checkpoint model loaded: ['decoder.cls.predictions.decoder.weight', 'decoder.cls.predictions.decoder.bias'].


TrainOutput(global_step=1790, training_loss=1.7048181310046318, metrics={'train_runtime': 3776.9962, 'train_samples_per_second': 7.566, 'train_steps_per_second': 0.474, 'total_flos': 5.172825055897498e+18, 'train_loss': 1.7048181310046318, 'epoch': 5.0})

## Evaluating Model

In [13]:
# Switch the model to inference mode (`eval()` is `train(False)`; both return
# the module itself, so `model` keeps pointing at the same object).
model = model.train(False)

In [14]:
# Helper function that generates a caption for a single image file

def generate_caption(image_path, model, image_processor, tokenizer, device, max_length=128):
    """Generate a caption for the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to caption.
        model: Captioning model exposing ``generate`` and
            ``config.decoder_start_token_id``.
        image_processor: Callable turning a PIL image into ``pixel_values``.
        tokenizer: Tokenizer used to decode the generated ids; its
            ``sep_token_id`` / ``pad_token_id`` are passed to ``generate``.
        device: Device the pixel tensor is moved to before generation.
        max_length: Upper bound on the generated sequence length
            (defaults to 128, the value that was previously hard-coded).

    Returns:
        The decoded caption string with special tokens removed.
    """
    # `convert` yields an independent in-memory image, so the file handle can
    # be released immediately.
    with Image.open(image_path) as img_file:
        image = img_file.convert('RGB')
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Passing the end-of-sequence id lets decoding stop at [SEP] instead of
    # always running to `max_length`; the pad id is what finished sequences
    # are filled with.
    outputs = model.generate(pixel_values,
                             max_length=max_length,
                             decoder_start_token_id=model.config.decoder_start_token_id,
                             eos_token_id=tokenizer.sep_token_id,
                             pad_token_id=tokenizer.pad_token_id)

    caption = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return caption[0]

In [15]:
# Held-out test split: image folder plus the CSV index with the reference
# captions.
test_image_dir = '/content/custom_captions_dataset/test/'
test_csv_path = '/content/custom_captions_dataset/test.csv'
test_labels = pd.read_csv(test_csv_path)

In [16]:
# Caption every test image with the trained model, one image at a time.
# File names come from the second CSV column; tqdm shows the progress bar.

predictions = [
    generate_caption(
        os.path.join(test_image_dir, file_name),
        model,
        image_processor,
        tokenizer,
        device,
    )
    for file_name in tqdm(test_labels.iloc[:, 1], desc="Generating captions")
]

Generating captions: 100%|██████████| 928/928 [31:02<00:00,  2.01s/it]


In [17]:
# Store the generated captions next to the reference captions.
PREDICTION_COLUMN = 'Predictions'
test_labels[PREDICTION_COLUMN] = predictions

In [18]:
# Preview the first rows: reference caption and generated caption side by side.
test_labels.head()

Unnamed: 0.1,Unnamed: 0,filename,caption,Predictions
0,0,test_1.jpg,A large building with bars on the windows in f...,a red and white bus is driving down a busy cit...
1,1,test_2.jpg,A person is skiing through the snow. There is ...,a man is skiing down a hill. he is wearing a b...
2,2,test_3.jpg,There is a bed in a room against a wall. There...,a bed is in a room. there is a window on the w...
3,3,test_4.jpg,A black and red train is on the tracks and has...,a train is on the tracks. the train is red and...
4,4,test_5.jpg,A white and yellow public transportation bus w...,a bus is parked on the side of a street. the b...


In [19]:
!pip install https://files.pythonhosted.org/packages/1f/56/a81022436c08b9405a5247b71635394d44fe7e1dbedc4b28c740e09c2840/rouge_score-0.0.4-py2.py3-none-any.whl

Collecting rouge-score==0.0.4
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [20]:
# Pull the columns needed for evaluation out of the frame as plain lists.
reference_captions, generated_captions, test_imgs = (
    test_labels[column_name].tolist()
    for column_name in ('caption', 'Predictions', 'filename')
)

## Evaluating metrics

In [21]:
from rouge_score import rouge_scorer

rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Score every (reference, prediction) pair. Both sides are passed through
# `str` first so non-string cells do not break the scorer.
rougeL_results = [
    rouge.score(str(reference_text), str(predicted_text))['rougeL']
    for reference_text, predicted_text in zip(reference_captions, generated_captions)
]

# Split the per-sample results into the three ROUGE-L components.
rougeL_precision = [result.precision for result in rougeL_results]
rougeL_recall = [result.recall for result in rougeL_results]
rougeL_f1 = [result.fmeasure for result in rougeL_results]

# Plain arithmetic means over the test set.
average_rougeL_precision = sum(rougeL_precision) / len(rougeL_precision)
average_rougeL_recall = sum(rougeL_recall) / len(rougeL_recall)
average_rougeL_f1 = sum(rougeL_f1) / len(rougeL_f1)

print("Average Rouge-L Precision:", average_rougeL_precision)
print("Average Rouge-L Recall:", average_rougeL_recall)
print("Average Rouge-L F1-score:", average_rougeL_f1)

Average Rouge-L Precision: 0.34221915579261264
Average Rouge-L Recall: 0.26280162830809567
Average Rouge-L F1-score: 0.27451505099608364


In [22]:
!pip install pycocoevalcap
!pip install nltk



In [24]:
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

def evaluate(model, image_ids=None, reference_texts=None, generated_texts=None):
    """Compute corpus-level CIDEr and SPICE for already generated captions.

    Args:
        model: Model that produced the captions; it is only switched to
            evaluation mode here, no inference is run.
        image_ids: Image identifiers, one per caption pair. Defaults to the
            notebook-level ``test_imgs`` list.
        reference_texts: Reference captions aligned with ``image_ids``.
            Defaults to the notebook-level ``reference_captions`` list.
        generated_texts: Generated captions aligned with ``image_ids``.
            Defaults to the notebook-level ``generated_captions`` list.

    Returns:
        Tuple ``(cider_score, spice_score)`` holding the first element
        returned by each scorer's ``compute_score``.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """
    # Fall back to the notebook globals so `evaluate(model)` keeps working.
    if image_ids is None:
        image_ids = test_imgs
    if reference_texts is None:
        reference_texts = reference_captions
    if generated_texts is None:
        generated_texts = generated_captions

    if not (len(image_ids) == len(reference_texts) == len(generated_texts)):
        raise ValueError(
            "image_ids, reference_texts and generated_texts must have the same length"
        )

    # Initialize evaluation metrics
    cider_scorer = Cider()
    spice_scorer = Spice()

    # Switch to evaluation mode
    model.eval()

    # Both scorers take {image id: [captions]} mappings.
    references = {}
    hypotheses = {}
    for img_id, reference_text, generated_text in zip(image_ids, reference_texts, generated_texts):
        references.setdefault(img_id, []).append(reference_text)
        hypotheses.setdefault(img_id, []).append(generated_text)

    # Compute evaluation scores
    cider_score, _ = cider_scorer.compute_score(references, hypotheses)
    spice_score, _ = spice_scorer.compute_score(references, hypotheses)

    return cider_score, spice_score

# Score the generated test captions and report both metrics.
cider_score, spice_score = evaluate(model)
for metric_label, metric_value in (("CIDEr Score:", cider_score), ("SPICE Score:", spice_score)):
    print(metric_label, metric_value)

CIDEr Score: 0.10201522379203559
SPICE Score: 0.13907834708641043


### Inference on test data

In [28]:
# Print reference vs. generated caption for the samples at positions 1..49.
# Slicing (instead of indexing with a fixed range) simply yields fewer rows
# when the test set holds fewer than 50 samples, rather than raising
# IndexError.
# NOTE(review): position 0 is skipped, as in the original loop - confirm that
# this is intended.
sample_rows = zip(test_imgs[1:50], reference_captions[1:50], generated_captions[1:50])
for image_name, actual_caption, predicted_caption in sample_rows:
    print("For Image :", image_name)
    print("ACTUAL CAPTION: ", actual_caption)
    print("GENERATED: ", predicted_caption)
    print('\n')


For Image : test_2.jpg
ACTUAL CAPTION:  A person is skiing through the snow. There is loose snow all around them from him jumping. The person is wearing a yellow snow suit. The person is holding two ski poles in their hands. 
GENERATED:  a man is skiing down a hill. he is wearing a black jacket and black pants. the man is holding a ski pole in his hands. the man is wearing a black helmet. the snow is white. the snow is white. the man is wearing a black jacket. the man is wearing a black helmet. the man is wearing a black jacket. the man is wearing a black jacket.


For Image : test_3.jpg
ACTUAL CAPTION:  There is a bed in a room against a wall. There is a brown blanket on top of the bed. There is a small brown book shelf next to the bed. There is a picture hanging on the wall above the shelf. 
GENERATED:  a bed is in a room. there is a window on the wall. there is a window next to the bed. there is a window on the window. there is a white door on the window. there is a white door on th