### **Loading ImageCaptioningModel**

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image, ImageDraw
from tqdm import tqdm
import evaluate
import random
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    ViTModel,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    ViTImageProcessor,
    AutoProcessor, 
    AutoModelForVision2Seq,    
)

In [16]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda


In [17]:
class ImageCaptioningModel(nn.Module):
    def __init__(self, vit_name='WinKawaks/vit-small-patch16-224', gpt2_name='gpt2'):
        super().__init__()
        
        # ViT encoder (drop classification head)
        self.vit_processor = ViTImageProcessor.from_pretrained(vit_name)
        self.vit_encoder = ViTModel.from_pretrained(vit_name)
        
        # GPT-2 decoder with cross-attention enabled
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # necessary for padding
        self.gpt2_decoder = GPT2LMHeadModel.from_pretrained(gpt2_name, add_cross_attention=True)
        
        # Linear projector from ViT hidden size -> GPT-2 hidden size
        self.vit_hidden_size = self.vit_encoder.config.hidden_size
        self.gpt2_hidden_size = self.gpt2_decoder.config.hidden_size
        self.projector = nn.Linear(self.vit_hidden_size, self.gpt2_hidden_size)

    def forward(self, images, captions):
        # Step 1: Encode image with ViT
        vit_inputs = self.vit_processor(images=images, return_tensors="pt").to(self.vit_encoder.device)
        vit_outputs = self.vit_encoder(**vit_inputs, output_hidden_states=False)
        patch_embeddings = vit_outputs.last_hidden_state[:, 1:, :]  # remove CLS token

        # Step 2: Project patch embeddings to GPT2 hidden size
        projected_patches = self.projector(patch_embeddings)

        # Step 3: Tokenize captions
        caption_inputs = self.tokenizer(captions, return_tensors="pt", padding=True, truncation=True)
        input_ids = caption_inputs["input_ids"].to(self.gpt2_decoder.device)
        attention_mask = caption_inputs["attention_mask"].to(self.gpt2_decoder.device)

        # Step 4: Create attention mask for encoder (image patches)
        encoder_attention_mask = torch.ones(projected_patches.shape[:2], dtype=torch.long).to(self.gpt2_decoder.device)

        # Step 5: Decode with GPT2
        outputs = self.gpt2_decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=projected_patches,
            encoder_attention_mask=encoder_attention_mask,
            labels=input_ids  # for training loss
        )

        return outputs  # contains loss and logits


In [18]:
def load_model(model, load_path):
    model = torch.load(load_path)
    model.eval()  # Set to evaluation mode
    print(f"Model loaded from {load_path}")

In [21]:

model = ImageCaptioningModel().to(DEVICE)

Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', '

In [None]:
# Later, reload the model
model.load_state_dict(torch.load("/home/lovish/ImageCaptioning/Model/vit_gpt2_captioning_model.pth"))

<All keys matched successfully>

In [24]:
# Custom Dataset
class ImageCaptionDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image, caption = self.data[idx]
        image = image.convert("RGB")  
        return image, caption

# Collate function for batching
def collate_fn(batch):
    images, captions = zip(*batch)
    return list(images), list(captions)

In [25]:
def generate_caption_from_image(img, model, max_length=50):
    model.eval()
    device = next(model.parameters()).device

    # Convert image to RGB (in case it's not)
    img = img.convert("RGB")

    # Step 1: Preprocess and encode image with ViT
    vit_inputs = model.vit_processor(images=[img], return_tensors="pt").to(device)
    with torch.no_grad():
        vit_outputs = model.vit_encoder(**vit_inputs)
    patch_embeddings = vit_outputs.last_hidden_state[:, 1:, :]  # remove CLS
    projected_patches = model.projector(patch_embeddings)

    # Step 2: Generate caption from GPT-2
    input_ids = torch.tensor([[model.tokenizer.eos_token_id]], device=device)  # Start token
    encoder_attention_mask = torch.ones(projected_patches.shape[:-1], dtype=torch.long).to(device)

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model.gpt2_decoder(
                input_ids=input_ids,
                encoder_hidden_states=projected_patches,
                encoder_attention_mask=encoder_attention_mask,
            )
            next_token_logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1)

            # Append predicted token
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)

            # Stop if EOS token is produced
            if next_token.item() == model.tokenizer.eos_token_id:
                break

    # Decode to string, skipping the initial start token
    caption = model.tokenizer.decode(input_ids[0][1:], skip_special_tokens=True)
    return caption

In [26]:
# Load SmolVLM model and processor once
smolvlm_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
smolvlm_model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager"
).to(DEVICE)

smolvlm_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "give a detailed caption for this image"}
        ]
    },
]

In [27]:
def generate_caption(model_name, image, custom_model=None):
    image = image.convert("RGB")
    
    if model_name.lower() == "smolvlm":
        prompt = smolvlm_processor.apply_chat_template(smolvlm_messages, add_generation_prompt=True)
        inputs = smolvlm_processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            generated_ids = smolvlm_model.generate(**inputs, max_new_tokens=50)
        generated_text = smolvlm_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return generated_text.split("Assistant:")[-1].strip()
    
    elif model_name.lower() == "vit-gpt2" and custom_model is not None:
        return generate_caption_from_image(image, custom_model)
    
    else:
        raise ValueError(f"Unsupported model: {model_name}")


In [29]:
# For SmolVLM
caption = generate_caption("smolvlm", Image.open("/home/lovish/ImageCaptioning/custom_captions_dataset/test/test_1.jpg"))
print(caption)

# For ViT-GPT2
caption = generate_caption("vit-gpt2", Image.open("/home/lovish/ImageCaptioning/custom_captions_dataset/test/test_1.jpg"), custom_model=model)
print(caption)


A large, modern airport terminal building with a flat roof and a large glass facade. The building is made of light-colored stone and has a large, flat roof. The front of the building is open to the sky, and there are several large
 photo shows a truck and motorcycle parked in front of a yellow building. The truck has a yellow stripe on the front of it. There is a car behind the truck. There is a yellow stripe on the street. There is a white building behind the


In [30]:
def calculate_scores(hypothesis,references):
    rouge = evaluate.load("rouge")
    rouge_score = rouge.compute(predictions=hypothesis,references=references)
    bleu = evaluate.load("bleu")
    BLEU_score = bleu.compute(predictions=hypothesis, references=references)
    meteor = evaluate.load("meteor")
    meteor_score = meteor.compute(predictions=hypothesis,references=references)

    return BLEU_score['bleu'], meteor_score['meteor'], rouge_score['rougeL']

In [31]:
def occlude_image(img, occlusion_percent):
    img = img.convert("RGB").copy()
    w, h = img.size
    total_pixels = w * h
    num_pixels_to_occlude = int(total_pixels * occlusion_percent)

    draw = ImageDraw.Draw(img)

    while num_pixels_to_occlude > 0:
        # Random block size and position
        block_w = random.randint(10, 40)
        block_h = random.randint(10, 40)
        x = random.randint(0, w - block_w)
        y = random.randint(0, h - block_h)

        block_area = block_w * block_h
        if block_area > num_pixels_to_occlude:
            block_h = max(1, num_pixels_to_occlude // block_w)
            block_area = block_w * block_h

        draw.rectangle([x, y, x + block_w, y + block_h], fill=(0, 0, 0))
        num_pixels_to_occlude -= block_area

    return img

In [32]:


def evaluate_on_occluded_image(data_loader, occlusion_levels, custom_model, device):
    records = {
        "caption": [],
        "reference": [],
        "occlusion": [],
        "model": []
    }

    evaluation_scores = defaultdict(dict)

    for occ in occlusion_levels:
        for images, captions in tqdm(data_loader, desc=f"Evaluating occlusion level {occ}"):
            for img, ref_caption in zip(images, captions):
                occluded_img = occlude_image(img, occ)

                # Evaluate with custom model
                try:
                    pred_custom = generate_caption("vit-gpt2", occluded_img, custom_model=custom_model)
                except Exception as e:
                    pred_custom = f"[ERROR: {e}]"

                records["caption"].append(pred_custom)
                records["reference"].append(ref_caption)
                records["occlusion"].append(occ)
                records["model"].append("vit-gpt2")

                # Evaluate with smolvlm
                try:
                    pred_smol = generate_caption("smolvlm", occluded_img)
                except Exception as e:
                    pred_smol = f"[ERROR: {e}]"

                records["caption"].append(pred_smol)
                records["reference"].append(ref_caption)
                records["occlusion"].append(occ)
                records["model"].append("smolvlm")

        # Compute scores for each model at current occlusion level
        for model_name in ["vit-gpt2", "smolvlm"]:
            preds = [c for c, m, o in zip(records["caption"], records["model"], records["occlusion"]) if m == model_name and o == occ]
            refs  = [r for r, m, o in zip(records["reference"], records["model"], records["occlusion"]) if m == model_name and o == occ]

            try:
                bleu, meteor, rougeL = calculate_scores(preds, refs)
                evaluation_scores[model_name][occ] = {
                    "BLEU": bleu,
                    "METEOR": meteor,
                    "ROUGE-L": rougeL
                }
            except Exception as e:
                evaluation_scores[model_name][occ] = {
                    "BLEU": None,
                    "METEOR": None,
                    "ROUGE-L": None,
                    "Error": str(e)
                }

    return {
        "records": records,
        "scores": evaluation_scores
    }


In [33]:
def creatDataSet(csv_file_path,image_folderpath):
    df=pd.read_csv(csv_file_path)
    data=[]
    for _ , row in tqdm(df.iterrows(), total=df.shape[0]):
        img=Image.open(f'{image_folderpath}/{row["filename"]}')
        gt_caption=row['caption']
        data.append([img,gt_caption])
    
    return data

In [34]:
test_csv_path="custom_captions_dataset/test.csv"
test_image_path="custom_captions_dataset/test"
test_data=creatDataSet(test_csv_path,test_image_path)

test_data = ImageCaptionDataset(test_data)
test_loader = DataLoader(test_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

100%|██████████| 928/928 [00:00<00:00, 4518.90it/s]


In [37]:
# After running evaluate_on_occluded_image(...)
result = evaluate_on_occluded_image(test_loader, [0.0, 0.1, 0.5, 0.8], model, DEVICE)

# Create a DataFrame from records
df = pd.DataFrame(result["records"])
df.to_csv('occluded_image_captions.csv')
print(df.head())

# Evaluation scores
print(result["scores"])

Evaluating occlusion level 0.0: 100%|██████████| 116/116 [42:42<00:00, 22.09s/it]
[nltk_data] Downloading package wordnet to /home/lovish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/lovish/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/lovish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lovish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/lovish/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/lovish/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Evaluating occlusion level 0.1: 100%|██████████| 116/116 [40:51<00:00, 21.14s/it]
[nltk_data] Downloading package wordnet to /home/lovish/nltk_data...
[nltk_data]   Package wordn

                                             caption  \
0   image is of a bird eating a grass-fed chicken...   
1  Two birds are standing on a paved path. The bi...   
2  planes are flying in the sky. The sky is very ...   
3  Four blue jets are flying in formation. The je...   
4   bedroom has a large bed in it. The bed has wh...   

                                           reference  occlusion     model  
0  Two birds stand next to each other.  The birds...        0.0  vit-gpt2  
1  Two birds stand next to each other.  The birds...        0.0   smolvlm  
2  There are four jets flying in the sky together...        0.0  vit-gpt2  
3  There are four jets flying in the sky together...        0.0   smolvlm  
4  This is a hotel room. There is a patterned com...        0.0  vit-gpt2  
defaultdict(<class 'dict'>, {'vit-gpt2': {0.0: {'BLEU': 0.06731295454548539, 'METEOR': np.float64(0.24733249028518053), 'ROUGE-L': np.float64(0.27065165815107983)}, 0.1: {'BLEU': 0.06234001328728869, 'METEOR

In [38]:
print(result["scores"])

defaultdict(<class 'dict'>, {'vit-gpt2': {0.0: {'BLEU': 0.06731295454548539, 'METEOR': np.float64(0.24733249028518053), 'ROUGE-L': np.float64(0.27065165815107983)}, 0.1: {'BLEU': 0.06234001328728869, 'METEOR': np.float64(0.24059009158816858), 'ROUGE-L': np.float64(0.26280973035257105)}, 0.5: {'BLEU': 0.05303069021522836, 'METEOR': np.float64(0.2192008985726052), 'ROUGE-L': np.float64(0.24259449068736796)}, 0.8: {'BLEU': 0.04395797020110086, 'METEOR': np.float64(0.20607194433245668), 'ROUGE-L': np.float64(0.22467066258485577)}}, 'smolvlm': {0.0: {'BLEU': 0.06339899968180299, 'METEOR': np.float64(0.23436718301270482), 'ROUGE-L': np.float64(0.2742979380390852)}, 0.1: {'BLEU': 0.06041942151346876, 'METEOR': np.float64(0.2307815531861651), 'ROUGE-L': np.float64(0.2724314592719359)}, 0.5: {'BLEU': 0.04050770229908609, 'METEOR': np.float64(0.19682980618717053), 'ROUGE-L': np.float64(0.24458888476740562)}, 0.8: {'BLEU': 0.025632136966549667, 'METEOR': np.float64(0.1600829049977319), 'ROUGE-L':