In [3]:
pip install torch transformers pillow

Collecting torch
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# AI Image Caption Generator
# This script uses a pre-trained BLIP model to generate captions for input images

# Import necessary libraries
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import warnings

# Suppress warnings to keep output clean (optional).
# NOTE: no category or module filter is given, so this ignores every warning
# issued through the `warnings` module from this point on.
warnings.filterwarnings('ignore')

def initialize_model(model_name="Salesforce/blip-image-captioning-base", device=None):
    """
    Initialize the BLIP model and processor from Hugging Face.
    Args:
        model_name: Hub identifier (or local path) of the BLIP captioning
            checkpoint. The same value is used for the processor and the
            model so the two can never get out of sync.
        device: Device to place the model on (e.g. "cuda" or "cpu"). When
            None, CUDA is used if torch reports it as available, else CPU.
    Returns:
        processor: BLIP processor for image preprocessing
        model: BLIP model for caption generation
    """
    # Load the processor and model from the same checkpoint
    print("⏳ Loading BLIP model... (This may take a moment)")
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)

    # Pick a device automatically unless the caller chose one
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    # Normalize to str so the status message works for torch.device values too
    device = str(device)
    model = model.to(device)

    # Inference only; harmless if the model is already in evaluation mode
    model.eval()

    print(f"✅ Model loaded successfully on {device.upper()}!")
    return processor, model

# Image loading helpers: normalize the user-supplied path, then open the image

def clean_image_path(image_path):
    """
    Normalize a user-supplied image path.

    Removes surrounding whitespace and surrounding single/double quotes (as
    added by "Copy as path" or by dragging a file into a terminal), including
    the case where whitespace sits outside or just inside the quotes.
    Args:
        image_path: Path as a string or an os.PathLike object
    Returns:
        The cleaned path as a string
    """
    import os  # local import keeps this helper self-contained

    # Accept pathlib.Path and other PathLike objects as well as plain strings
    path_text = os.fspath(image_path)

    # Whitespace must go first: str.strip('"\'') stops at the first character
    # that is not a quote, so a leading space would otherwise protect the quote.
    return path_text.strip().strip('"\'').strip()

def load_image(image_path):
    """
    Load and validate an image from the given path.

    The pixel data is fully read and the underlying file is closed before
    returning, so corrupt or truncated files are reported here rather than
    later during caption generation.
    Args:
        image_path: Path to the image file
    Returns:
        PIL Image object in RGB mode (detached from the file on disk)
    Raises:
        FileNotFoundError: If image doesn't exist
        ValueError: If file is not a valid image
    """
    try:
        # Clean the path first
        clean_path = clean_image_path(image_path)

        # Image.open is lazy and keeps the file open; the context manager
        # closes it, and convert() forces the data to be decoded and returns
        # a new in-memory image (a copy when the mode is already RGB).
        with Image.open(clean_path) as source_image:
            return source_image.convert('RGB')
    except FileNotFoundError as e:
        # No "Error:" prefix here: main() already adds one when reporting
        raise FileNotFoundError(f"The file '{clean_path}' does not exist.") from e
    except Exception as e:
        raise ValueError(f"Unable to open image. {str(e)}") from e

# Caption generation and the interactive entry point

def generate_caption(processor, model, image, use_beam_search=True,
                     max_length=50, num_beams=5):
    """
    Generate a caption for the given image.
    Args:
        processor: BLIP processor
        model: BLIP model
        image: PIL Image object
        use_beam_search: Whether to use beam search for generation (better but slower)
        max_length: Maximum length passed to model.generate
        num_beams: Number of beams; only used when use_beam_search is True
    Returns:
        Generated caption string with its first character upper-cased
        (empty string if the model produced no text)
    """
    # Preprocess the image and move the tensors to the model's device
    inputs = processor(image, return_tensors="pt").to(model.device)

    # Greedy search passes no num_beams at all, leaving the model's own
    # generation default in effect; beam search requests num_beams explicitly.
    generation_kwargs = {"max_length": max_length}
    if use_beam_search:
        generation_kwargs["num_beams"] = num_beams

    # Generate caption
    print("🧠 Generating caption...")
    generated_ids = model.generate(**inputs, **generation_kwargs)

    # Decode the first (only) sequence
    caption = processor.decode(generated_ids[0], skip_special_tokens=True).strip()

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest and mangle proper nouns in cased output.
    return caption[:1].upper() + caption[1:]

def main():
    """
    Main function to run the image caption generator.

    Loads the model once, then repeatedly prompts for an image path and
    prints a caption for it until the user enters 'quit', 'exit' or 'q',
    or the prompt is interrupted / stdin is closed.
    """
    print("\n" + "="*50)
    print("🖼️  AI Image Caption Generator")
    print("="*50 + "\n")

    # Initialize model
    processor, model = initialize_model()

    # Get image path from user
    while True:
        try:
            image_path = input("📂 Enter the path to your image (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted: exit cleanly
            # instead of surfacing a traceback
            print("\n👋 Goodbye!")
            return

        if image_path.lower() in ('quit', 'exit', 'q'):
            print("👋 Goodbye!")
            return

        if not image_path:
            # Nothing typed; re-prompt rather than trying to open an empty path
            print("⚠️ No path entered. Please try again.\n")
            continue

        try:
            # Load image
            image = load_image(image_path)

            # Generate and display caption
            caption = generate_caption(processor, model, image)

            print("\n" + "="*50)
            print("📝 Generated Caption:")
            print(f"✨ {caption}")
            print("="*50 + "\n")

        except Exception as e:
            # Report the problem and keep the session alive for another attempt
            print(f"❌ Error: {str(e)}\n")

# Start the interactive captioning loop when this code is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



🖼️  AI Image Caption Generator

⏳ Loading BLIP model... (This may take a moment)
✅ Model loaded successfully on CPU!
🧠 Generating caption...

📝 Generated Caption:
✨ A group of people standing on a beach with a pirate ship in the background

🧠 Generating caption...

📝 Generated Caption:
✨ A man standing on top of a mountain with a red sky in the background

👋 Goodbye!
