In [1]:
import io
import os
from PIL import Image
import torch
from torchvision import transforms
from datasets import load_dataset, Dataset

def transform_format(example):
    """Flatten one raw record into id / image / Question / Answer fields.

    The first entry of ``example["conversations"]`` supplies the question and
    the second supplies the answer; both are read from their ``"value"`` key.
    """
    flattened = {}
    # Copy the fields that pass through unchanged.
    for passthrough_key in ("id", "image"):
        flattened[passthrough_key] = example[passthrough_key]

    turns = example["conversations"]
    flattened["Question"] = turns[0]["value"]
    flattened["Answer"] = turns[1]["value"]
    return flattened

def load_images_as_pil(dataset, image_dir):
    """Open every example's image file and return plain dict records.

    Args:
        dataset: Iterable of mappings with "image" (a file name resolved
            against ``image_dir``), "Question" and "Answer" keys.
        image_dir: Directory the image file names are joined onto.

    Returns:
        A list with one dict per example, in input order, holding "image"
        (the image converted to RGB, or ``None`` when loading failed),
        "Question" and "Answer".
    """
    updated_entries = []
    for example in dataset:
        image_path = os.path.join(image_dir, example["image"])
        try:
            # The context manager releases the underlying file handle once the
            # pixels are decoded; convert() returns an independent image that
            # stays usable after the source image is closed.
            with Image.open(image_path) as source_image:
                pil_image = source_image.convert("RGB")
        except Exception as e:
            # Best effort: report the failure and keep a None placeholder so
            # the output stays aligned one-to-one with the input.
            print(f"Error loading image {image_path}: {e}")
            pil_image = None
        updated_entries.append({
            "image": pil_image,
            "Question": example["Question"],
            "Answer": example["Answer"],
        })
    return updated_entries

def pil_image_to_bytes(image):
    """Encode ``image`` as JPEG into an in-memory buffer and return the bytes."""
    with io.BytesIO() as jpeg_stream:
        image.save(jpeg_stream, format='JPEG')
        encoded = jpeg_stream.getvalue()
    return encoded

def bytes_to_pil_image(byte_data):
    """Wrap ``byte_data`` in an in-memory stream and open it as an image."""
    stream = io.BytesIO(byte_data)
    decoded_image = Image.open(stream)
    return decoded_image

# Load the JSON records and flatten each one to id / image / Question / Answer
validate_json_path = "./Inference data comprised 40% of the dataset.json"
raw_splits = load_dataset("json", data_files={"train": validate_json_path})
flat_records = raw_splits.map(transform_format)["train"]

# Read the referenced image files from disk
image_dir = "Demoface"
loaded_records = load_images_as_pil(flat_records, image_dir)

# load_images_as_pil leaves image=None for files it could not read. Encoding
# such an entry would raise AttributeError, so drop them before serialising.
usable_records = [item for item in loaded_records if item["image"] is not None]
num_skipped = len(loaded_records) - len(usable_records)
if num_skipped:
    print(f"Skipped {num_skipped} example(s) whose image could not be loaded")

data_dict = {
    "image": [pil_image_to_bytes(item["image"]) for item in usable_records],
    "Question": [item["Question"] for item in usable_records],
    "Answer": [item["Answer"] for item in usable_records],
}

# Only the final Dataset is bound to `dataset`; later cells read it by that name.
dataset = Dataset.from_dict(data_dict)

# Show a short summary of the first record instead of dumping raw JPEG bytes
if len(dataset) > 0:
    first_record = dataset[0]
    print({
        "image": f"<{len(first_record['image'])} JPEG bytes>",
        "Question": first_record["Question"],
        "Answer": first_record["Answer"],
    })


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
import torch
import io
import random
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, AutoProcessor


# Restore the fine-tuned checkpoint and its matching processor from disk
model_dir = "./Paligemma_fine_tuned_75" 
model = PaliGemmaForConditionalGeneration.from_pretrained(model_dir)
processor = AutoProcessor.from_pretrained(model_dir)

# Use the first CUDA device when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model.to(device)

def generate_prediction(model, processor, image, question):
    """Generate text for one image/question pair.

    Args:
        model: Generation model exposing ``eval()``, ``generate()`` and a
            ``device`` attribute.
        processor: Callable that builds the model inputs from text and image;
            its ``tokenizer`` decodes the generated ids.
        image: Image handed to the processor.
        question: Question text inserted into the prompt after "answer".

    Returns:
        The decoded output with everything up to and including the first
        occurrence of "answer" removed and surrounding whitespace stripped.
        Callers in this notebook split the result on ".\\n" and use the second
        piece as the predicted answer.
    """
    # Prompt layout: image placeholder, BOS marker, then "answer <question>".
    text = f"<image> <bos> answer {question}"

    # Move the inputs to the device of the model that was passed in, rather
    # than a notebook-level global that may not match it.
    inputs = processor(text=text, images=image, return_tensors="pt").to(model.device)

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,  # Limits answer length
            do_sample=True,     # Sampling: repeated calls may give different text
            temperature=1,
            top_p=1,
        )
    decoded = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Split only once: the prompt's "answer" prefix is the first occurrence, so
    # a question or generated answer containing "answer" is not truncated.
    return decoded.split("answer", 1)[-1].strip()

# Preview predictions on up to 20 randomly chosen examples.
# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the dataset size.
num_preview_samples = min(20, len(dataset))
random_indices = random.sample(range(len(dataset)), num_preview_samples)
random_samples = [dataset[i] for i in random_indices]

for sample in random_samples:
    if isinstance(sample["image"], bytes):  # If image is in byte format
        image = bytes_to_pil_image(sample["image"]).convert("RGB")
    else:  # If image is a file path
        image = Image.open(sample["image"]).convert("RGB")

    question = sample["Question"]
    ground_truth = sample["Answer"]

    # Generate prediction
    prediction = generate_prediction(model, processor, image, question)

    # The text after the first ".\n" is the predicted answer. Fall back to the
    # full text when that separator is missing instead of raising IndexError.
    # Done outside the f-string: a backslash or reused quote inside an f-string
    # expression is a SyntaxError before Python 3.12.
    segments = prediction.split(".\n")
    predicted_answer = segments[1] if len(segments) > 1 else prediction

    # Display results
    print(f"🔹 Question: {question}")
    print(f"✅ Ground Truth: {ground_truth}")
    print(f"🎯 Prediction: {predicted_answer}")
    print("-" * 60)

Loading checkpoint shards: 100%|██████████████████████████████████| 3/3 [00:00<00:00,  9.18it/s]


🔹 Question: Identify the person in the image and describe the appearance.
✅ Ground Truth: Person MEMAM_6, An Middle Eastern Middle-Aged Male
🎯 Prediction: Person MEMAM_1, an Middle Eastern Middle-Aged Male
------------------------------------------------------------
🔹 Question: Identify the person in the image and describe the appearance.
✅ Ground Truth: Person SEAYGM_3, An Southeast Asian Young Male
🎯 Prediction: Person SEAYGM_10, An Southeast Asian Young Male
------------------------------------------------------------
🔹 Question: Identify the person in the image and describe the appearance.
✅ Ground Truth: Person HPSRF_4, An Hispanic Senior Female
🎯 Prediction: Person CASRF_11, An Caucasian Senior Female
------------------------------------------------------------
🔹 Question: Identify the person in the image and describe the appearance.
✅ Ground Truth: Person MEMAF_9, An Middle Eastern Middle-Aged Female
🎯 Prediction: Person CAMAF_9, An Caucasian Middle-Aged Female
-----------------

In [None]:
import csv

# Write one row per example in `dataset` to the predictions CSV
csv_filename =  "Paligemma_predictions_75.csv"
rows_written = 0
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    # NOTE(review): the second column is labelled "Question with Ground Truth"
    # but receives the full generated text (see the row written below). The
    # header is left unchanged in case existing consumers read it by name;
    # confirm and rename if nothing depends on it.
    writer.writerow(["Ground Truth","Question with Ground Truth",  "Prediction"])  # Write header
    for sample in dataset:
        # Load image
        if isinstance(sample["image"], bytes):  # If image is in byte format
            image = bytes_to_pil_image(sample["image"]).convert("RGB")
        else:  # If image is a file path
            image = Image.open(sample["image"]).convert("RGB")

        question = sample["Question"]
        ground_truth = sample["Answer"]

        prediction = generate_prediction(model, processor, image, question)

        # The text after the first ".\n" is the predicted answer. Fall back to
        # the full text when the separator is missing so one unusual
        # generation cannot abort the export with an IndexError.
        segments = prediction.split(".\n")
        predicted_answer = segments[1] if len(segments) > 1 else prediction

        writer.writerow([ground_truth, prediction, predicted_answer])
        rows_written += 1

print(f"✅ Predictions saved to {csv_filename}")
print(rows_written)