In [2]:
import os
import json
from datasets import load_dataset
from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
from PIL import Image
import torch

In [4]:
# Step 1: Load JSON Captions and Custom Dataset
class ImageCaptionDataset(Dataset):
    def __init__(self, annotations_file, image_dir, processor):
        with open(annotations_file, "r") as f:
            data = json.load(f)
            self.captions = data["annotations"]  # Access the "annotations" key
        self.image_dir = image_dir
        self.processor = processor
        print(f"Loaded {len(self.captions)} captions.")  # Debug print

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        caption_data = self.captions[idx]
        image_path = os.path.join(self.image_dir, caption_data["image_id"])
        print(f"Loading image: {image_path}")  # Debug print
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        image = Image.open(image_path).convert("RGB")
        caption = caption_data["caption"]

        inputs = self.processor(images=image, text=caption, return_tensors="pt", padding="max_length", max_length=128, truncation=True)
        inputs["labels"] = inputs.input_ids.clone()
        return inputs

In [6]:
annotations_file = r"C:\Users\sumay\Desktop\AI_Image_Tagging\data\captions.json"  # JSON
image_dir = r"C:\Users\sumay\Desktop\AI_Image_Tagging\data\train"                # images
print(f"Image directory: {image_dir}")
print(f"Directory exists: {os.path.exists(image_dir)}")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
dataset = ImageCaptionDataset(annotations_file, image_dir, processor)

# Debug print
print(f"Total dataset size: {len(dataset)}")
print(f"Sample entry: {dataset[0]}")  # Print the first entry

Image directory: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train
Directory exists: True
Loaded 852 captions.
Total dataset size: 852
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\black_charger_lenovo.JPG
Sample entry: {'pixel_values': tensor([[[[0.5143, 0.6311, 0.7625,  ..., 1.1128, 1.1274, 1.0836],
          [0.5435, 0.4705, 0.4413,  ..., 1.0398, 1.1274, 1.0398],
          [0.5435, 0.5727, 0.4851,  ..., 1.2150, 1.1128, 1.0836],
          ...,
          [0.7479, 0.7625, 0.8063,  ..., 1.0544, 0.9084, 0.9230],
          [0.6603, 0.6895, 0.7333,  ..., 0.7041, 0.7333, 0.7625],
          [0.4121, 0.3975, 0.3537,  ..., 0.7333, 0.6311, 0.6019]],

         [[0.4390, 0.5591, 0.6942,  ..., 1.0544, 1.0694, 1.0243],
          [0.4691, 0.3940, 0.3640,  ..., 0.9793, 1.0694, 0.9793],
          [0.4691, 0.4991, 0.4090,  ..., 1.1594, 1.0544, 1.0243],
          ...,
          [0.6942, 0.7092, 0.7542,  ..., 0.9193, 0.7842, 0.7992],
          [0.6041, 0.6341, 0.6792,  ..., 0.5741, 0

In [7]:
import os
import json

# Load the JSON file
annotations_file = r"C:\Users\sumay\Desktop\AI_Image_Tagging\data\captions.json"
with open(annotations_file, "r") as f:
    data = json.load(f)

# Check for missing files
image_dir = r"C:\Users\sumay\Desktop\AI_Image_Tagging\data\train"
missing_files = []

for entry in data["annotations"]:
    image_path = os.path.join(image_dir, entry["image_id"])
    if not os.path.exists(image_path):
        missing_files.append(entry["image_id"])

# Print missing files
if missing_files:
    print("Missing files:")
    for file in missing_files:
        print(file)
else:
    print("All files exist.")

All files exist.


In [10]:
# Load dataset
dataset = ImageCaptionDataset(annotations_file, image_dir, processor)

# Load BLIP model
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

Loaded 852 captions.


In [12]:
def data_collator(batch):
    input_ids = torch.cat([item["input_ids"] for item in batch])
    pixel_values = torch.cat([item["pixel_values"] for item in batch])
    labels = torch.cat([item["labels"] for item in batch])
    attention_mask = torch.cat([item["attention_mask"] for item in batch])  #I Added this line beacause it was rocomendded

    return {
        "input_ids": input_ids,
        "pixel_values": pixel_values,
        "labels": labels,
        "attention_mask": attention_mask,  # Include the attention mask
    }

In [14]:
training_args = TrainingArguments(
    output_dir="./blip_finetuned",
    per_device_train_batch_size=8,  # Increased batch size
    num_train_epochs=10,  # Increased epochs
    save_steps=400,  # Save steps is now a multiple of eval_steps (200 * 2 = 400) --> needed 
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps
    logging_steps=50,  # Log less frequently
    save_total_limit=3,
    resume_from_checkpoint=False,  # Start fresh
    learning_rate=3e-5,  # Lower learning rate
    weight_decay=0.001,  # Reduced weight decay
    push_to_hub=False,
    fp16=False,  # Disable mixed precision
    gradient_accumulation_steps=2,  # Simulate larger batch size
    load_best_model_at_end=True,  # Early stopping
    metric_for_best_model="eval_loss",  # Use evaluation loss for best model
)

In [16]:
import os
from transformers import Trainer, EarlyStoppingCallback

print(f"Total dataset size: {len(dataset)}")  # Debug print

if len(dataset) > 1:
    train_size = max(1, int(0.9 * len(dataset)))  # Use 90% for training
    val_size = len(dataset) - train_size
    print(f"Train size: {train_size}, Val size: {val_size}")  # Debug print
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
else:
    train_dataset = dataset  # Use the entire dataset for training
    val_dataset = None       # No validation set

# Define the checkpoint directory
checkpoint_dir = "./blip_finetuned"

# Check if there's an existing checkpoint
resume_checkpoint = None
if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
    resume_checkpoint = checkpoint_dir  # Automatically resume from the last checkpoint
    print(f"Resuming from checkpoint: {resume_checkpoint}")

# Add Early Stopping Callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,  # Stop if validation loss doesn’t improve for 3 evaluations
    early_stopping_threshold=0.001  # Minimum required improvement in loss
)

# Initialize Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping]  # Add early stopping callback
)

# Train the model with progress tracking and checkpoint resuming
trainer.train(resume_from_checkpoint=resume_checkpoint)

# Save the fine-tuned model
trainer.save_model(checkpoint_dir)
processor.save_pretrained(checkpoint_dir)

print("Fine-tuning complete! Model saved to ./blip_finetuned")


Total dataset size: 852
Train size: 766, Val size: 86
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_1715.PNG
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Children's_Strollers (10).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Bracelets (1).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_0314.jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_0569.jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_0564.jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\ring (17).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Keychains (1).PNG
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Glasses (19).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_0289.jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\ring (21).jpg
Loading image: C:\Users\sumay\Desktop\AI_Ima

Step,Training Loss,Validation Loss
200,0.1904,0.143252
400,0.0114,0.094122


Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_0353.jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_1615.PNG
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\iphone11 (23).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\iphone_promax (23).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Carry_On (16).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\IMG_0349.jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Car_Keys (16).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\ring (18).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Glasses (13).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Neck Pillows (19).PNG
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\backpack (8).jpg
Loading image: C:\Users\sumay\Desktop\AI_Image_Tagging\data\train\Neck Pillows (9).PNG
L

There were missing keys in the checkpoint model loaded: ['text_decoder.cls.predictions.decoder.bias'].


Fine-tuning complete! Model saved to ./blip_finetuned


In [18]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

# Load the fine-tuned model
model_path = "./blip_finetuned"  
model = BlipForConditionalGeneration.from_pretrained(model_path)
processor = BlipProcessor.from_pretrained(model_path)

# can't work on cuda still :(
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [44]:
img_path = r"C:\Users\sumay\Pictures\Screenshots\Screenshot 2025-02-06 232635.png"
image = Image.open(img_path).convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(**inputs, num_beams=5, max_length=90)

caption = processor.batch_decode(output, skip_special_tokens=True)[0]
print(f"Caption: {caption}")


Caption: category : other personal belongings, tag : sunglasses, a brown tinted square sunglasses with bold black frame


In [22]:
import shutil
import os

# Path to the fine-tuned model directory
fine_tuned_model_dir = "./blip_finetuned"

# Check if the directory exists and delete it
if os.path.exists(fine_tuned_model_dir):
    shutil.rmtree(fine_tuned_model_dir)
    print(f"Deleted fine-tuned model directory: {fine_tuned_model_dir}")
else:
    print(f"No fine-tuned model directory found at: {fine_tuned_model_dir}")

Deleted fine-tuned model directory: ./blip_finetuned
