## Section 9 
This is the training script that was used to fine-tune the new BLIP model, which I chose because of some compatibility issues.

In [None]:
import os
import json
import torch
from torch.utils.data import Dataset
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments

# Dataset

class FossilCaptionDataset(Dataset):
    """Image/caption pairs read from a JSON file, encoded by a processor.

    The JSON file is loaded as a mapping. Each key is joined onto
    ``image_folder`` to locate an image file, and the matching value is
    handed to the processor as the caption text.

    Args:
        json_path: Path of the JSON file holding the name -> caption mapping.
        image_folder: Directory that the image names are relative to.
        processor: Callable invoked with ``images=``, ``text=``,
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``. Its result must provide the keys
            "pixel_values", "input_ids" and "attention_mask".
    """

    def __init__(self, json_path, image_folder, processor):
        # Use a context manager so the file handle is released as soon as
        # the JSON has been parsed; JSON text is read as UTF-8.
        with open(json_path, encoding="utf-8") as json_file:
            self.data = json.load(json_file)
        self.keys = list(self.data.keys())
        self.image_folder = image_folder
        self.processor = processor

    def __len__(self):
        """Return the number of image/caption pairs."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            A dict with "pixel_values", "input_ids" and "attention_mask",
            each with the first dimension of the processor output squeezed
            away (``squeeze(0)``).
        """
        img_name = self.keys[idx]
        caption = self.data[img_name]

        img_path = os.path.join(self.image_folder, img_name)
        # convert() yields a separate RGB image, so the underlying file can
        # be closed immediately instead of staying open per sample.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        inputs = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }


# Paths and model

# Caption file and image directory, both relative to the working directory.
json_path = "dataset/dataset.json"
image_folder = "dataset/images/"

# The processor and the model are loaded from the same pretrained checkpoint.
model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

dataset = FossilCaptionDataset(
    json_path=json_path,
    image_folder=image_folder,
    processor=processor,
)
print(f"Dataset size: {len(dataset)}")


# Training arguments
training_args = TrainingArguments(
    output_dir="blip-finetuned-fossils",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=20,
    save_steps=500,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # when none is available so the script still starts on CPU-only machines.
    fp16=torch.cuda.is_available(),
    report_to="none",
)


# Collate function for BLIP
def collate_fn(batch):
    """Merge per-sample dicts into one batch dict and derive the labels.

    "pixel_values", "input_ids" and "attention_mask" are each stacked along
    a new leading dimension. "labels" is a copy of the stacked input ids in
    which every position equal to the tokenizer's pad token id is set to
    -100.
    """
    stacked = {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("pixel_values", "input_ids", "attention_mask")
    }

    # Work on a copy so the input ids passed to the model keep their padding.
    targets = stacked["input_ids"].clone()
    pad_id = processor.tokenizer.pad_token_id
    # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
    # these positions are presumably excluded from the loss.
    targets[targets == pad_id] = -100

    stacked["labels"] = targets
    return stacked

# Custom Trainer to avoid num_items_in_batch error

class BLIPTrainer(Trainer):
    """Trainer whose loss step calls the model with an explicit argument set.

    Only "pixel_values", "input_ids", "attention_mask" and "labels" are
    forwarded to the model. Any extra keyword arguments passed to
    ``compute_loss`` are accepted and ignored.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the model's own loss.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
            just ``loss``.
        """
        forward_args = {
            "pixel_values": inputs["pixel_values"],
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            # .get() means a batch without labels passes labels=None.
            "labels": inputs.get("labels"),
        }
        outputs = model(**forward_args)

        if return_outputs:
            return outputs.loss, outputs
        return outputs.loss

# Wire the model, training arguments, dataset and collator into the trainer.
trainer = BLIPTrainer(
    args=training_args,
    model=model,
    data_collator=collate_fn,
    train_dataset=dataset,
)

# Train

trainer.train()

# Save model and processor side by side so both load from the same directory
model.save_pretrained(save_directory="blip-finetuned-fossils")
processor.save_pretrained(save_directory="blip-finetuned-fossils")


Dataset size: 5401


Step,Training Loss
20,6.916
40,4.9671
60,4.3923
80,3.9872
100,3.823
120,3.5259
140,3.2096
160,3.0365
180,3.1176
200,2.8282


[]