### **IMPORTs**

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
from tqdm import tqdm
import evaluate

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    ViTModel,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    ViTImageProcessor,
    AutoProcessor, 
    AutoModelForVision2Seq,    
)

In [64]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda


### **LOAD DATASETs**

In [67]:
def creatDataSet(csv_file_path,image_folderpath):
    df=pd.read_csv(csv_file_path)
    data=[]
    for _ , row in tqdm(df.iterrows(), total=df.shape[0]):
        img=Image.open(f'{image_folderpath}/{row["filename"]}')
        gt_caption=row['caption']
        data.append([img,gt_caption])
    
    return data

In [68]:
train_csv_path="/home/lovish/ImageCaptioning/custom_captions_dataset/train.csv"
train_image_path="custom_captions_dataset/train"
train_data=creatDataSet(train_csv_path,train_image_path)

  0%|          | 0/5715 [00:00<?, ?it/s]

100%|██████████| 5715/5715 [00:00<00:00, 5999.20it/s]


In [69]:
val_csv_path="custom_captions_dataset/val.csv"
val_image_path="custom_captions_dataset/val"
val_data=creatDataSet(val_csv_path,val_image_path)

100%|██████████| 946/946 [00:00<00:00, 5923.73it/s]


In [71]:
test_csv_path="custom_captions_dataset/test.csv"
test_image_path="custom_captions_dataset/test"
test_data=creatDataSet(test_csv_path,test_image_path)

100%|██████████| 928/928 [00:00<00:00, 13031.02it/s]


In [72]:
print("Training datset :",len(train_data))
print("Validation datset :",len(val_data))
print("Testing datset :",len(test_data))

Training datset : 5715
Validation datset : 946
Testing datset : 928


### **Zero Shot Caption Generation**

In [73]:
# preloading smolvlm model
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
smol_model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager"  # force eager mode
).to(DEVICE)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image"
            },
            {
                 "type": "text", 
                 "text": "give a detailed caption for this image"
            }
        ]
    },
]

In [74]:
def zero_shot_captioning(img_path, model_name):
    img = Image.open(img_path)
    if model_name.lower() == 'smolvlm':
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[img], return_tensors="pt")
        inputs = inputs.to(DEVICE)
        
        generated_ids = smol_model.generate(**inputs, max_new_tokens=50)

        generated_texts = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )
        return generated_texts[0].split('Assistant:')[1]
    else :
        print(f'{model_name} is currently not available in the program')
        

In [75]:
zero_shot_captioning('custom_captions_dataset/test/test_1.jpg', 'smolvlm')

' A large, modern airport terminal building with a flat roof and a large glass facade. The building is made of light-colored stone and has a large, flat roof. The front of the building is open to the sky, and there are several large'

### **Custom ImageCaptionModel Class**

In [77]:
class ImageCaptioningModel(nn.Module):
    def __init__(self, vit_name='WinKawaks/vit-small-patch16-224', gpt2_name='gpt2'):
        super().__init__()
        
        # ViT encoder (drop classification head)
        self.vit_processor = ViTImageProcessor.from_pretrained(vit_name)
        self.vit_encoder = ViTModel.from_pretrained(vit_name)
        
        # GPT-2 decoder with cross-attention enabled
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # necessary for padding
        self.gpt2_decoder = GPT2LMHeadModel.from_pretrained(gpt2_name, add_cross_attention=True)
        
        # Linear projector from ViT hidden size -> GPT-2 hidden size
        self.vit_hidden_size = self.vit_encoder.config.hidden_size
        self.gpt2_hidden_size = self.gpt2_decoder.config.hidden_size
        self.projector = nn.Linear(self.vit_hidden_size, self.gpt2_hidden_size)

    def forward(self, images, captions):
        # Step 1: Encode image with ViT
        vit_inputs = self.vit_processor(images=images, return_tensors="pt").to(self.vit_encoder.device)
        vit_outputs = self.vit_encoder(**vit_inputs, output_hidden_states=False)
        patch_embeddings = vit_outputs.last_hidden_state[:, 1:, :]  # remove CLS token

        # Step 2: Project patch embeddings to GPT2 hidden size
        projected_patches = self.projector(patch_embeddings)

        # Step 3: Tokenize captions
        caption_inputs = self.tokenizer(captions, return_tensors="pt", padding=True, truncation=True)
        input_ids = caption_inputs["input_ids"].to(self.gpt2_decoder.device)
        attention_mask = caption_inputs["attention_mask"].to(self.gpt2_decoder.device)

        # Step 4: Create attention mask for encoder (image patches)
        encoder_attention_mask = torch.ones(projected_patches.shape[:2], dtype=torch.long).to(self.gpt2_decoder.device)

        # Step 5: Decode with GPT2
        outputs = self.gpt2_decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=projected_patches,
            encoder_attention_mask=encoder_attention_mask,
            labels=input_ids  # for training loss
        )

        return outputs  # contains loss and logits


In [78]:
# Custom Dataset
class ImageCaptionDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image, caption = self.data[idx]
        image = image.convert("RGB")  
        return image, caption

# Collate function for batching
def collate_fn(batch):
    images, captions = zip(*batch)
    return list(images), list(captions)

# Save model weights
def save_model(model, save_path):
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

In [79]:
def train_model(model, train_loader, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
        for images, captions in loop:
            # Forward pass
            outputs = model(images=images, captions=captions)
            loss = outputs.loss
    
            # Backward + optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())
    
        print(f"Epoch {epoch+1} average loss: {total_loss / len(train_loader):.4f}")
    save_path = "model/vit_gpt2_captioning_model.pth"
    save_model(model, save_path)
    torch.cuda.empty_cache()
    
    

In [80]:
# Initialize model
model = ImageCaptioningModel().to(DEVICE)
tokenizer = model.tokenizer

# Prepare DataLoader
train_dataset = ImageCaptionDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', '

In [None]:

train_model(model, train_loader, optimizer, DEVICE, 10)

Epoch 1/3:   0%|          | 0/715 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/3: 100%|██████████| 715/715 [04:00<00:00,  2.97it/s, loss=1.72] 


Epoch 1 average loss: 1.7176


Epoch 2/3: 100%|██████████| 715/715 [03:42<00:00,  3.21it/s, loss=1.24] 


Epoch 2 average loss: 1.5278


Epoch 3/3: 100%|██████████| 715/715 [03:42<00:00,  3.21it/s, loss=1.47] 

Epoch 3 average loss: 1.4155





**Loading saved model weights (ran 10 epochs)**

In [81]:
# Later, reload the model
model.load_state_dict(torch.load("/home/lovish/ImageCaptioning/Model/vit_gpt2_captioning_model.pth"))

<All keys matched successfully>

In [82]:
def generate_caption_from_image(img, model, max_length=50):
    model.eval()
    device = next(model.parameters()).device

    # Convert image to RGB (in case it's not)
    img = img.convert("RGB")

    # Step 1: Preprocess and encode image with ViT
    vit_inputs = model.vit_processor(images=[img], return_tensors="pt").to(device)
    with torch.no_grad():
        vit_outputs = model.vit_encoder(**vit_inputs)
    patch_embeddings = vit_outputs.last_hidden_state[:, 1:, :]  # remove CLS
    projected_patches = model.projector(patch_embeddings)

    # Step 2: Generate caption from GPT-2
    input_ids = torch.tensor([[model.tokenizer.eos_token_id]], device=device)  # Start token
    encoder_attention_mask = torch.ones(projected_patches.shape[:-1], dtype=torch.long).to(device)

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model.gpt2_decoder(
                input_ids=input_ids,
                encoder_hidden_states=projected_patches,
                encoder_attention_mask=encoder_attention_mask,
            )
            next_token_logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1)

            # Append predicted token
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)

            # Stop if EOS token is produced
            if next_token.item() == model.tokenizer.eos_token_id:
                break

    # Decode to string, skipping the initial start token
    caption = model.tokenizer.decode(input_ids[0][1:], skip_special_tokens=True)
    return caption

In [None]:
def calculate_scores(hypothesis,references):
    
    rouge = evaluate.load("rouge")
    rouge_score = rouge.compute(predictions=hypothesis,references=references)
    bleu = evaluate.load("bleu")
    BLEU_score = bleu.compute(predictions=hypothesis, references=references)
    meteor = evaluate.load("meteor")
    meteor_score = meteor.compute(predictions=hypothesis,references=references)

    return BLEU_score['bleu'], meteor_score['meteor'], rouge_score['rougeL']

def evaluate_model(model, data_loader, device):
    model.eval()
    all_predictions = []
    all_references = []

    for images, captions in tqdm(data_loader, desc="Evaluating"):
        for img, ref_caption in zip(images, captions):
            prediction = generate_caption_from_image(img, model)
            all_predictions.append(prediction)
            all_references.append(ref_caption)

    # Compute evaluation scores
    bleu, meteor, rougeL = calculate_scores(all_predictions, all_references)

    return {
        "BLEU": bleu,
        "METEOR": meteor,
        "ROUGE-L": rougeL
    }

    

### EVALUATING SCORES

##### OUR MODEL

In [None]:
# Prepare DataLoader
test_dataset = ImageCaptionDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

results = evaluate_model(model, test_loader, DEVICE)

In [91]:
OurModelScores=results
avgBLEU_OurModel=OurModelScores["BLEU"]
avgMETEOR_OurModel=OurModelScores["METEOR"]
avgROUGE_OurModel=OurModelScores["ROUGE-L"]

print("_"*20,"OUR COUSTOM MODEL SCORES","_"*20)
print("BLEU SCORE:",avgBLEU_OurModel)
print("METEOR SCORE:",avgMETEOR_OurModel)
print("ROUGE SCORE:",avgROUGE_OurModel)

____________________ OUR COUSTOM MODEL SCORES ____________________
BLEU SCORE: 0.06731295454548539
METEOR SCORE: 0.24733249028518053
ROUGE SCORE: 0.2705377493522648


#### smolVLM

In [None]:
def evaluate_smolvlm(model, processor, data_loader, device):
    model.eval()
    all_predictions = []
    all_references = []

    for images, captions in tqdm(data_loader, desc="Evaluating SmolVLM"):
        for img, ref_caption in zip(images, captions):
            img = img.convert("RGB")  # Ensure correct format
            try:
                prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
                inputs = processor(text=prompt, images=[img], return_tensors="pt").to(device)

                with torch.no_grad():
                    generated_ids = model.generate(**inputs, max_new_tokens=50)

                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                predicted_caption = generated_text.split("Assistant:")[-1].strip()

                all_predictions.append(predicted_caption)
                all_references.append(ref_caption)
            except Exception as e:
                print(f"Error processing image: {e}")
                continue

    # Compute evaluation scores
    bleu, meteor, rougeL = calculate_scores(all_predictions, all_references)

    return {
        "BLEU": bleu,
        "METEOR": meteor,
        "ROUGE-L": rougeL
    }

In [90]:
smolVLM_scores=evaluate_smolvlm(smol_model, processor, test_loader, DEVICE)

Evaluating SmolVLM: 100%|██████████| 116/116 [35:24<00:00, 18.32s/it]
[nltk_data] Downloading package wordnet to /home/lovish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/lovish/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/lovish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [94]:

avgBLEU_smolVLM=smolVLM_scores["BLEU"]
avgMETEOR_smolVLM=smolVLM_scores["METEOR"]
avgROUGE_smolVLM=smolVLM_scores["ROUGE-L"]

print("_"*20,"smallVLM MODEL SCORES","_"*20)
print("BLEU SCORE:",avgBLEU_OurModel)
print("METEOR SCORE:",avgMETEOR_smolVLM)
print("ROUGE SCORE:",avgROUGE_smolVLM)

____________________ smallVLM MODEL SCORES ____________________
BLEU SCORE: 0.06731295454548539
METEOR SCORE: 0.23436718301270482
ROUGE SCORE: 0.27432802118733357
