In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used by
# the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
from torchvision.transforms.functional import to_pil_image
from transformers import BlipProcessor, BlipForConditionalGeneration

class Flickr8kDataset(Dataset):
    """Map-style dataset pairing each image with all of its reference captions.

    The captions file is read line by line and every line is split on its
    first comma into ``<image file name>,<caption>`` (captions may therefore
    contain commas). Lines without a comma, including blank lines, are
    ignored. Captions are grouped per image, and images are indexed in the
    order in which they first appear in the file.

    Args:
        captions_file: Path to the comma-separated captions text file.
        img_dir: Directory holding the image files named in ``captions_file``.
        transform: Optional callable applied to the loaded RGB ``PIL.Image``.
    """

    # A first line that parses to exactly this pair is treated as a column
    # header rather than an image entry.
    # NOTE(review): the commonly distributed captions.txt is believed to start
    # with such a header -- confirm against the actual file.
    _HEADER = ("image", "caption")

    def __init__(self, captions_file, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # image file name -> list of captions, in file order
        self.img_captions = defaultdict(list)

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(captions_file, 'r', encoding='utf-8') as file:
            for line_no, line in enumerate(file):
                parts = line.strip().split(',', 1)  # split on the first comma only
                if len(parts) != 2:
                    continue
                img_name, caption = parts
                if line_no == 0 and (img_name, caption) == self._HEADER:
                    # Otherwise "image" would be indexed as a file name and
                    # fail when __getitem__ tries to open it.
                    continue
                self.img_captions[img_name].append(caption)

        # Stable positional index over the image names.
        self.img_names = list(self.img_captions.keys())

    def __len__(self):
        """Return the number of distinct images."""
        return len(self.img_names)

    def __getitem__(self, idx):
        """Return ``(image, captions)`` for the image at position ``idx``.

        The image is opened from ``img_dir``, converted to RGB and passed
        through ``transform`` when one was given; ``captions`` is the list of
        all caption strings recorded for that image.
        """
        img_name = self.img_names[idx]
        captions = self.img_captions[img_name]
        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, captions


# Preprocessing handed to the dataset: resize every image to 384x384, then
# convert it to a tensor.
transform = Compose([Resize((384, 384)), ToTensor()])


In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
import json


# Preprocessing handed to the dataset: resize every image to 384x384, then
# convert it to a tensor.
transform = Compose([Resize((384, 384)), ToTensor()])

# Dataset over the captions file and image folder on Drive, served one image
# per batch in file order.
captions_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/captions.txt'
images_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/Images'
dataset = Flickr8kDataset(captions_path, images_path, transform)
data_loader = DataLoader(dataset, batch_size=1, shuffle=False)

# BLIP: processor and model are loaded from the same checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-large"
blip_processor = BlipProcessor.from_pretrained(blip_checkpoint)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

# ViT-GPT2 and Pix2Struct captioners, each wrapped in an image-to-text pipeline.
gpt2_caption_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
pix2struct_caption_pipeline = pipeline("image-to-text", model="google/pix2struct-textcaps-base")

def generate_caption_with_blip(image_pil, processor, model):
    """Caption one image with a BLIP-style processor/model pair.

    Args:
        image_pil: Image to caption; passed to ``processor`` as ``images``.
        processor: Callable that builds the model inputs (called with
            ``return_tensors="pt"``) and whose ``decode`` turns generated
            token ids back into text.
        model: Object whose ``generate`` produces the caption token ids.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    inputs = processor(images=image_pil, return_tensors="pt")

    # Move the inputs next to the model so generation also works when the
    # model has been placed on a GPU. Skipped for objects that expose no
    # ``device`` / ``to`` so plain-dict inputs keep working.
    device = getattr(model, "device", None)
    if device is not None and hasattr(inputs, "to"):
        inputs = inputs.to(device)

    # Beam search with 4 beams, captions capped at 128 tokens.
    outputs = model.generate(**inputs, max_length=128, num_beams=4, return_dict_in_generate=True)
    return processor.decode(outputs.sequences[0], skip_special_tokens=True)

def generate_caption_with_gpt2(image_pil):
    """Caption ``image_pil`` with the module-level ``gpt2_caption_pipeline``.

    Returns the ``generated_text`` of the first prediction, or the string
    "Caption generation failed" when the pipeline returns nothing.
    """
    predictions = gpt2_caption_pipeline(image_pil)
    if not predictions:
        return "Caption generation failed"
    return predictions[0]['generated_text']

def generate_caption_with_pix2struct(image_pil):
    """Caption ``image_pil`` with the module-level ``pix2struct_caption_pipeline``.

    Returns the ``generated_text`` of the first prediction, or the string
    "Caption generation failed" when the pipeline returns nothing.
    """
    predictions = pix2struct_caption_pipeline(image_pil)
    if not predictions:
        return "Caption generation failed"
    return predictions[0]['generated_text']

# Slice of the dataset captioned by this run: indices START_IDX .. END_IDX - 1.
# The output file name is derived from END_IDX so the range and the file
# suffix cannot drift apart between runs.
START_IDX = 1000
END_IDX = 1100

# Pix2Struct output is left blank in this run; set to True to actually call
# generate_caption_with_pix2struct for every image.
RUN_PIX2STRUCT = False

# Iterate over the requested slice only. Skipping inside a loop over the full
# loader would still open and transform every earlier image just to discard it.
slice_end = min(END_IDX, len(dataset))  # never index past the end of the dataset
slice_loader = DataLoader(
    Subset(dataset, range(START_IDX, slice_end)),
    batch_size=1,
    shuffle=False,
)

# Generated and reference captions, keyed by "image_<dataset index>"
captions_dict = {}

for idx, (images, captions_batch) in enumerate(slice_loader, start=START_IDX):
    # Drop the batch dimension and convert the tensor back to a PIL image
    image_pil = to_pil_image(images.squeeze(0))

    blip_caption = generate_caption_with_blip(image_pil, blip_processor, blip_model)
    gpt2_caption = generate_caption_with_gpt2(image_pil)
    pix2struct_caption = generate_caption_with_pix2struct(image_pil) if RUN_PIX2STRUCT else ""

    # With batch_size=1 the loader's collation is expected to hand back each
    # caption wrapped in a one-element sequence. It is stored in that form on
    # purpose: the later transform cell unwraps it with caption[0].
    original_captions = captions_batch

    print(f"Image {idx + 1}:")
    print(f"BLIP Caption: {blip_caption}")
    print(f"GPT-2 Caption: {gpt2_caption}")
    print(f"Pix2Struct Caption: {pix2struct_caption}")
    print("Original Captions:")
    for caption in original_captions:
        # Show the caption text rather than the one-element wrapper around it
        caption_text = caption[0] if isinstance(caption, (list, tuple)) and caption else caption
        print(f"- {caption_text}")
    print("-----------------------------------------------------\n")

    captions_dict[f"image_{idx}"] = {
        "blip_caption": blip_caption,
        "gpt2_caption": gpt2_caption,
        "pix2struct_caption": pix2struct_caption,
        "original_captions": original_captions
    }

# Save this run's captions; the suffix matches what the merge cell looks for
output_path = f'/content/drive/My Drive/MasterThesis/flickr8k_dataset/captions_data_flickr_{END_IDX}.json'
with open(output_path, 'w') as f:
    json.dump(captions_dict, f, indent=4)


Output hidden; open in https://colab.research.google.com to view.

In [None]:
import json
import os

def merge_json_files(start_suffix, end_suffix, base_directory, base_filename, output_filename, step=100):
    """Merge a numbered series of JSON dictionary files into a single file.

    Reads ``<base_filename>_<suffix>.json`` from ``base_directory`` for every
    suffix ``start_suffix, start_suffix + step, ...`` that does not exceed
    ``end_suffix``, folds their top-level dictionaries together and writes
    the result to ``output_filename`` in the same directory.

    Args:
        start_suffix: Numeric suffix of the first file to merge.
        end_suffix: Largest suffix that may be merged (inclusive).
        base_directory: Directory holding the inputs and receiving the output.
        base_filename: File name stem placed before ``_<suffix>.json``.
        output_filename: Name of the merged file to write.
        step: Distance between consecutive suffixes; defaults to 100.

    Raises:
        ValueError: If ``step`` is not positive.
        FileNotFoundError: If one of the expected input files is missing.

    Note:
        When the same key occurs in several files, the value from the file
        with the larger suffix wins.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    combined_dict = {}
    # ``end_suffix + 1`` keeps the end inclusive without running past it when
    # the range is not an exact multiple of ``step``.
    for suffix in range(start_suffix, end_suffix + 1, step):
        file_path = os.path.join(base_directory, f"{base_filename}_{suffix}.json")
        with open(file_path, 'r', encoding='utf-8') as f:
            combined_dict.update(json.load(f))

    output_file_path = os.path.join(base_directory, output_filename)
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_dict, f, indent=4)


# Merge the per-run caption files with suffixes 100 through 1100 into one
# combined JSON file in the dataset folder on Drive.
base_directory = '/content/drive/My Drive/MasterThesis/flickr8k_dataset'
base_filename = 'captions_data_flickr'
output_filename = 'combined_captions_data_flickr.json'
merge_json_files(
    start_suffix=100,
    end_suffix=1100,
    base_directory=base_directory,
    base_filename=base_filename,
    output_filename=output_filename,
)


In [None]:
import json

# Step 1: Read the merged JSON file
merged_file_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/combined_captions_data_flickr.json'
with open(merged_file_path, 'r', encoding='utf-8') as file:
    merged_data = json.load(file)


def _caption_texts(raw_captions):
    """Return the caption strings from a list of plain or wrapped captions.

    An entry may be a bare string or a list/tuple whose first element is the
    string. Empty wrappers are dropped instead of raising IndexError, and
    bare strings are kept whole rather than cut to their first character.
    """
    texts = []
    for caption in raw_captions:
        if isinstance(caption, (list, tuple)):
            if caption:
                texts.append(caption[0])
        else:
            texts.append(caption)
    return texts


# Step 2: Reshape every entry into the final schema, renumbered by position
transformed_data = {}
for idx, value in enumerate(merged_data.values()):
    transformed_entry = {
        # Falls back to the running position when the entry has no "image_id"
        "image_id": value.get("image_id", idx),
        "generated_captions": {
            "blip": value.get("blip_caption", ""),
            "gpt2": value.get("gpt2_caption", ""),
            "pix2struct": value.get("pix2struct_caption", "")
        },
        # Key name kept as is; it receives this file's "original_captions"
        "original_coco_captions": _caption_texts(value.get("original_captions", []))
    }
    transformed_data[str(idx)] = transformed_entry

# Step 3: Write the transformed data to a new JSON file
transformed_file_path = '/content/drive/My Drive/MasterThesis/flickr8k_dataset/combined_captions_data_flickr_final.json'
with open(transformed_file_path, 'w', encoding='utf-8') as file:
    json.dump(transformed_data, file, indent=4)
