# Evaluate BLIP model offline

Let’s start by loading our trained model and our test data.

In [5]:
!pip install torch torchvision pandas numpy matplotlib seaborn pillow torchsummary scikit-learn tritonclient[all] nltk

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tritonclient[all]
  Downloading tritonclient-2.56.0-py3-none-manylinux1_x86_64.whl.metadata (2.8 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting python-rapidjson>=0.9.1 (from tritonclient[all])
  Downloading python_rapidjson-1.20-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting cuda-python (from tritonclient[all])
  Downloading cuda_python-12.9.0-py3-none-any.whl.metadata (4.6 kB)
Collecting geventhttpclient>=2.3.3 (from tritonclient[all])
  Downloading geventhttpclient-2.3.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting grpcio<1.68,>=1.63.0 (from tritonclient[all])
  Downloading grpcio-1.67.1-cp312-cp312

In [1]:
#imports
import os
import base64
import torch
import pandas as pd
import numpy as np
import random
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import InterpolationMode
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import StratifiedKFold
import tritonclient.http as httpclient
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json

In [2]:
print(os.listdir('/mnt/data/Flickr30k'))

['flickr30k_test.json', 'flickr30k-images', 'flickr30k_test_gt.json', 'flickr30k_train.json', 'flickr30k_val.json', 'flickr30k_val_gt.json', 'flickr30k_test_online_gt.json', 'flickr30k_test_online.json']


In [3]:
json_file = pd.read_csv("/mnt/data/Flickr30k/flickr30k_test.json")
img_dir="/mnt/data/Flickr30k/flickr30k-images"

In [4]:
# Initialize client
client = httpclient.InferenceServerClient(url="129.114.109.59:8110")

def get_caption(image_path):
    # Read image
    with open(image_path, "rb") as f:
        image_bytes = f.read() 

    inputs = []
    inputs.append(httpclient.InferInput("INPUT_IMAGE", [1, 1], "BYTES"))

    encoded_str =  base64.b64encode(image_bytes).decode("utf-8")
    input_data = np.array([[encoded_str]], dtype=object)
    inputs[0].set_data_from_numpy(input_data)

    outputs = []
    outputs.append(httpclient.InferRequestedOutput("CAPTION", binary_data=False))
    results = client.infer(model_name="caption", inputs=inputs, outputs=outputs)
    cap = results.as_numpy("CAPTION")

    return cap

In [5]:
def generate_all_captions(data):
    results = []
    for sample in data:
        image_path = sample['image_path']
        expected = sample['expected_caption']
        generated = get_caption(image_path)
        results.append({
            'image_path': image_path,
            'expected_caption': expected,
            'generated_caption': generated
        })
    return results

In [6]:
# Load your JSON
with open('/mnt/data/Flickr30k/flickr30k_test.json') as f:
    dataset = json.load(f)

# Build your list in the format your function expects
data = []
for entry in dataset:
    img_name = entry['image']
    img_path = os.path.join('/mnt/data/Flickr30k', img_name)
    gt_captions = entry['caption']  # list of 5 captions
    
    try:
        # Check if image exists
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found: {img_path}")
        
        # If image exists, append the data
        data.append({'image_path': img_path, 'expected_caption': gt_captions})
        # print("appended")
    
    except FileNotFoundError as e:
        print(f"Skipping: {e}")
        continue  # Skip this image and move to the next one

results = generate_all_captions(data)

In [7]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def calculate_corpus_bleu(results):
    generated_texts = []
    reference_texts = []
    smoothing = SmoothingFunction().method4  # Apply smoothing (method4 is commonly used)

    for result in results:
        generated = result['generated_caption']
        expected = result['expected_caption']
        
        # Ensure generated text is a string (handle numpy arrays if needed)
        if isinstance(generated, np.ndarray):
            generated = generated.item()

        # Prepare references as a list of tokenized captions
        reference = [caption.split() for caption in expected]
        candidate = generated.split()  # Tokenize the generated caption
        
        # Append to list for corpus BLEU calculation
        reference_texts.append(reference)
        generated_texts.append(candidate)

    # Calculate BLEU score for the entire corpus
    bleu_score = corpus_bleu(reference_texts, generated_texts, smoothing_function=smoothing)
    
    return bleu_score


In [8]:
calculate_corpus_bleu(results)

0.18343450809603576

# Template based testing

In [9]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

captions = [
    "A black and white dog is running in a grassy garden surrounded by a white fence.",
    "A Boston Terrier is running on lush green grass in front of a white fence.",
    "A black and white dog is running through the grass.",
    "A dog runs on the green grass near a wooden fence.",
    "A Boston terrier is running in the grass."
]

def compute_bleu(reference_caption, generated_caption):
    reference_caption = str(reference_caption)
    generated_caption = str(generated_caption)
    reference = [reference_caption.split()]
    candidate = generated_caption.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference, candidate, smoothing_function=smoothie)

def test_caption_synonym_stability(image_path):
    print("Testing synonym stability ...")
    reference_caption = get_caption(image_path)
    print(reference_caption)
    for i, test_caption in enumerate(captions[0:], start=1):
        bleu_score = compute_bleu(reference_caption, test_caption)
        print(f"Caption {i} BLEU: {bleu_score:.3f}")
        # assert bleu_score > 0.5, f"Expected high BLEU for synonym caption {i}, but got {bleu_score:.3f}"

def test_caption_meaning_change(image_path):
    print("Testing meaning change ...")
    reference_caption = get_caption(image_path)
    print(reference_caption)
    changed_caption = "A cat is sleeping on a couch."
    bleu_score = compute_bleu(reference_caption, changed_caption)
    print(f"Meaning change BLEU: {bleu_score:.3f}")
    # assert bleu_score < 0.3, f"Expected low BLEU for meaning change, but got {bleu_score:.3f}"

# Run tests
test_caption_synonym_stability("dog_running_park.jpg")
test_caption_meaning_change("dog_running_park.jpg")

Testing synonym stability ...
['a brown dog running across a lush green field']
Caption 1 BLEU: 0.021
Caption 2 BLEU: 0.046
Caption 3 BLEU: 0.028
Caption 4 BLEU: 0.028
Caption 5 BLEU: 0.025
Testing meaning change ...
['a brown dog running across a lush green field']
Meaning change BLEU: 0.024
