# download the dataset

In [1]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [2]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/aladdinpersson/flickr8kimagescaptions")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: titus-tanashi
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/aladdinpersson/flickr8kimagescaptions


#take random 10% of dataset

In [28]:
captions_path = "/content/flickr8kimagescaptions/flickr8k/captions.txt"
image_folder = "/content/flickr8kimagescaptions/flickr8k/images"

In [29]:
from collections import defaultdict
import random
image_captions = defaultdict(list)

with open(captions_path, "r", encoding="utf-8") as f:
    next(f)  # Skip header if needed
    for line in f:
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        parts = line.split(",", 1)  # Split at first comma to avoid errors
        if len(parts) < 2:
            print(f"Skipping malformed line: {line}")  # Debugging info
            continue  # Skip invalid lines
        img_name, caption = parts
        image_captions[img_name].append(caption.lower().split())
sample_images = random.sample(list(image_captions.keys()), int(len(image_captions) * 0.1))

#call the model

In [20]:
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
model_path = "/content/model"  # Path to your trained model
processor_path = "/content/processor"  # Path to your processor

model = AutoModelForVision2Seq.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(processor_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=2304, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=2304, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
              (lora_magnitude

#evaluate model

In [24]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c373960d78c59716f174b9ec41741a4b878c0e8632b3a07d0da25a99931d740a
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [25]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from PIL import Image
import os

In [26]:
bleu_scores = {1: [], 2: [], 3: [], 4: []}
rouge_l_scores = []

In [30]:
for img_name in sample_images:
    img_path = os.path.join(image_folder, img_name)
    reference_captions = image_captions[img_name]

    # Load and preprocess image
    image = Image.open(img_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generate Caption
    with torch.no_grad():
        output_ids = model.generate(**inputs)
    generated_caption = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    generated_tokens = generated_caption.lower().split()

    # Compute BLEU Scores
    for n in range(1, 5):
        weights = [1/n] * n + [0] * (4 - n)  # Adjust weights
        bleu_scores[n].append(sentence_bleu(reference_captions, generated_tokens, weights=weights))

    # Compute ROUGE-L Score
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    score = scorer.score(" ".join(reference_captions[0]), generated_caption)
    rouge_l_scores.append(score["rougeL"].fmeasure)

# Compute Average Scores
avg_bleu = {f"BLEU-{k}": sum(v)/len(v) for k, v in bleu_scores.items()}
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

# Print Results
print("Evaluation Results on 10% Flickr8k:")
print(avg_bleu)
print(f"ROUGE-L: {avg_rouge_l:.4f}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Evaluation Results on 10% Flickr8k:
{'BLEU-1': 0.6576936566836769, 'BLEU-2': 0.489008008637832, 'BLEU-3': 0.3327776390740428, 'BLEU-4': 0.20073460277328123}
ROUGE-L: 0.4479
