In [2]:
!pip -q install --upgrade pip
!pip -q install transformers pillow timm accelerate requests


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.7/1.8 MB[0m [31m51.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os, io, re, time, typing as T
from dataclasses import dataclass
import requests
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Pick the compute backend in priority order: CUDA, then Apple-Silicon MPS, else CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print("Using device:", device)

# Half precision on the accelerator backends, full precision on CPU.
torch_dtype = torch.float16 if device.type in ("cuda", "mps") else torch.float32

# Fixed seed; return values are discarded into `_`.
_ = torch.manual_seed(42)
if device.type == "cuda":
    _ = torch.cuda.manual_seed_all(42)


Using device: cuda


In [4]:
# Matches an http:// or https:// scheme at the very start of a string, ignoring case.
URL_RE = re.compile(r'^https?://', re.IGNORECASE)


def is_url(s: str) -> bool:
    """Return True when ``s`` begins with an http(s) scheme.

    Falsy input (``None`` or the empty string) is treated as an empty string
    and therefore yields False.
    """
    candidate = s if s else ""
    return URL_RE.match(candidate) is not None

def load_image(source: T.Union[str, os.PathLike, Image.Image]) -> Image.Image:
    """Load an image from a PIL image, an http(s) URL, or a local path, as RGB.

    Args:
        source: An already-open ``PIL.Image.Image``, a URL string starting with
            ``http://`` or ``https://``, or a filesystem path given as ``str``
            or ``os.PathLike``.

    Returns:
        A new RGB ``PIL.Image.Image``.

    Raises:
        TypeError: If ``source`` is not one of the supported types.
        FileNotFoundError: If ``source`` is a local path that does not exist.
        requests.HTTPError: If the URL responds with an error status
            (raised by ``raise_for_status``).
    """
    if isinstance(source, Image.Image):
        return source.convert("RGB")

    # Accept pathlib.Path and other path-like objects by normalising them first.
    if isinstance(source, os.PathLike):
        source = os.fspath(source)
    if not isinstance(source, str):
        raise TypeError(f"Unsupported image source type: {type(source)}")

    if is_url(source):
        resp = requests.get(source, timeout=15)
        resp.raise_for_status()
        # convert() returns an independent image, so the decoder and its
        # in-memory buffer can be closed as soon as we leave the block.
        with Image.open(io.BytesIO(resp.content)) as remote_img:
            return remote_img.convert("RGB")

    if not os.path.exists(source):
        raise FileNotFoundError(f"File not found: {source}")
    # The context manager releases the underlying file handle deterministically.
    with Image.open(source) as local_img:
        return local_img.convert("RGB")


In [5]:
@dataclass
class GenerationConfig:
    """Decoding options that BLIPCaptioner passes on to ``model.generate``.

    Field order is part of the interface: the generated ``__init__`` accepts
    these values positionally in the order declared here.
    """

    # Forwarded to generate() as `max_length`.
    max_length: int = 32
    # Forwarded to generate() as `num_beams`.
    num_beams: int = 3
    # Forwarded as `do_sample`; also gates whether temperature/top_p are sent.
    do_sample: bool = False
    # Forwarded to generate() as `repetition_penalty`.
    repetition_penalty: float = 1.1
    # Only forwarded when do_sample is True; otherwise None is passed instead.
    temperature: float = 1.0
    # Only forwarded when do_sample is True; otherwise None is passed instead.
    top_p: float = 0.9

class BLIPCaptioner:
    """Thin wrapper around a BLIP captioning checkpoint and its processor.

    The model is loaded once in ``__init__`` and put in eval mode; ``caption``
    and ``caption_batch`` run generation under ``torch.inference_mode``.
    """

    def __init__(self, model_name: str = "Salesforce/blip-image-captioning-base",
                 torch_dtype: torch.dtype = torch.float16):
        """Load the processor and model for ``model_name`` onto the notebook's device.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            torch_dtype: Weight dtype used when the device is ``cuda`` or ``mps``;
                on any other device the model is loaded as ``torch.float32``.
        """
        self.model_name = model_name
        # Remember the device the model is placed on so later calls do not
        # depend on the module-level `device` still having the same value.
        self.device = device
        print(f"Loading model: {model_name}")
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch_dtype if self.device.type in ("cuda", "mps") else torch.float32,
            low_cpu_mem_usage=True,
        ).to(self.device)
        self.model.eval()

    @staticmethod
    def _generation_kwargs(gen: T.Optional[GenerationConfig]) -> T.Dict[str, T.Any]:
        """Turn a GenerationConfig (or None for the defaults) into generate() kwargs."""
        if gen is None:
            # Built per call: a default instance in the signature would be a
            # single mutable object shared by every invocation.
            gen = GenerationConfig()
        return dict(
            max_length=gen.max_length,
            num_beams=gen.num_beams,
            do_sample=gen.do_sample,
            repetition_penalty=gen.repetition_penalty,
            # Sampling-only settings are sent as None when sampling is off.
            temperature=gen.temperature if gen.do_sample else None,
            top_p=gen.top_p if gen.do_sample else None,
        )

    @torch.inference_mode()
    def caption(self, image_source: T.Union[str, Image.Image],
                prompt: T.Optional[str] = None,
                gen: T.Optional[GenerationConfig] = None) -> str:
        """Generate a caption for a single image.

        Args:
            image_source: Anything ``load_image`` accepts (PIL image, URL, path).
            prompt: Optional text given to the processor together with the
                image. Blank or whitespace-only prompts are ignored. In this
                notebook's recorded run the returned caption starts with the
                prompt text, i.e. the model continues it.
            gen: Decoding options; ``None`` uses ``GenerationConfig()`` defaults.

        Returns:
            The decoded caption with special tokens removed and surrounding
            whitespace stripped (matching ``caption_batch``).
        """
        img = load_image(image_source)
        text = prompt.strip() if prompt else ""
        if text:
            inputs = self.processor(images=img, text=text, return_tensors="pt")
        else:
            inputs = self.processor(images=img, return_tensors="pt")
        inputs = inputs.to(self.device, dtype=self.model.dtype)
        out_ids = self.model.generate(**inputs, **self._generation_kwargs(gen))
        return self.processor.decode(out_ids[0], skip_special_tokens=True).strip()

    @torch.inference_mode()
    def caption_batch(self, sources: T.List[T.Union[str, Image.Image]],
                      prompt: T.Optional[str] = None,
                      gen: T.Optional[GenerationConfig] = None) -> T.List[str]:
        """Generate captions for several images in one forward pass.

        Args:
            sources: Items accepted by ``load_image``; all are loaded up front.
            prompt: Optional text applied to every image; stripped the same way
                as in ``caption``. Blank prompts are ignored.
            gen: Decoding options; ``None`` uses ``GenerationConfig()`` defaults.

        Returns:
            One stripped caption per input, in input order. An empty ``sources``
            yields an empty list without calling the model.
        """
        images = [load_image(s) for s in sources]
        if not images:
            return []
        text = prompt.strip() if prompt else ""
        if text:
            inputs = self.processor(images=images, text=[text] * len(images),
                                    return_tensors="pt", padding=True)
        else:
            inputs = self.processor(images=images, return_tensors="pt", padding=True)
        inputs = inputs.to(self.device, dtype=self.model.dtype)
        out_ids = self.model.generate(**inputs, **self._generation_kwargs(gen))
        return [c.strip() for c in self.processor.batch_decode(out_ids, skip_special_tokens=True)]


In [6]:
# Checkpoint to load; "Salesforce/blip-image-captioning-large" is the alternative.
model_name = "Salesforce/blip-image-captioning-base"
cap = BLIPCaptioner(
    model_name=model_name,
    torch_dtype=torch_dtype,
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading model: Salesforce/blip-image-captioning-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [7]:
from IPython.display import display

test_url = "https://images.unsplash.com/photo-1500530855697-b586d89ba3ee?w=1200"

# Fetch the image once and reuse it: cap.caption accepts a PIL image, so there
# is no need to download the same URL a second time just to display it.
test_image = load_image(test_url)
print("Caption:", cap.caption(test_image))
display(test_image)

Output hidden; open in https://colab.research.google.com to view.

In [8]:
# Option A: local path
# local_path = "/path/to/your/image.jpg"
# print("Caption:", cap.caption(local_path))

# Option B: Colab upload
from IPython.display import display

# Only the import/upload step is best-effort. Errors raised while captioning an
# uploaded file are real failures and are allowed to surface with a traceback
# instead of being reported as "upload not available".
up = {}
upload_error = None
try:
    from google.colab import files
    up = files.upload()
except Exception as e:
    upload_error = e

if up:
    local_path = next(iter(up))
    print("Uploaded:", local_path)
    print("Caption:", cap.caption(local_path))
    display(load_image(local_path))
else:
    print("Colab upload not available (or skipped). Set local_path manually above if needed.")
    if upload_error is not None:
        # Say why, so a missing google.colab module can be told apart from other failures.
        print(f"Reason: {type(upload_error).__name__}: {upload_error}")


Colab upload not available (or skipped). Set local_path manually above if needed.


In [9]:
# Wider beam and a longer length limit than the GenerationConfig defaults.
gen_cfg = GenerationConfig(num_beams=5, max_length=40)

# NOTE: in the recorded output of this cell the caption begins with the prompt
# text, so the prompt behaves as a prefix that the model continues.
prompt = "a concise caption describing the main objects and scene"
src = "https://images.unsplash.com/photo-1519681393784-d120267933ba?w=1200"

print(
    "Prompted caption:",
    cap.caption(src, gen=gen_cfg, prompt=prompt),
)

Prompted caption: a concise caption describing the main objects and scene of the night sky


In [10]:
# Two sample images captioned in a single batched call, without a prompt.
sources = [
    "https://images.unsplash.com/photo-1507525428034-b723cf961d3e?w=1200",
    "https://images.unsplash.com/photo-1500534314209-a25ddb2bd429?w=1200",
]
captions = cap.caption_batch(
    sources,
    gen=GenerationConfig(num_beams=3, max_length=30),
    prompt=None,
)

# Print each source next to its caption, with a blank line before each pair.
for source_url, caption_text in zip(sources, captions):
    print()
    print("Image:", source_url)
    print("Caption:", caption_text)



Image: https://images.unsplash.com/photo-1507525428034-b723cf961d3e?w=1200
Caption: a beautiful sunset on the beach in the bahamas islands

Image: https://images.unsplash.com/photo-1500534314209-a25ddb2bd429?w=1200
Caption: a view of the mountains from the top of a hill


In [11]:
import csv
from datetime import datetime

def save_captions_to_csv(paths_or_urls, captions, out_path="captions.csv"):
    """Write (source, caption, timestamp) rows to a UTF-8 CSV file.

    Args:
        paths_or_urls: Iterable of image sources, one per caption.
        captions: Iterable of caption strings, in the same order.
        out_path: Destination file; overwritten if it already exists.

    Returns:
        ``out_path``, unchanged.

    Raises:
        ValueError: If the two inputs differ in length. Pairing them with a
            plain ``zip`` would silently drop the unmatched rows.
    """
    # Function-scope import: the notebook only imports `datetime` itself.
    from datetime import timezone

    sources = list(paths_or_urls)
    texts = list(captions)
    if len(sources) != len(texts):
        raise ValueError(
            f"Got {len(sources)} sources but {len(texts)} captions; lengths must match."
        )

    # One timezone-aware UTC timestamp shared by every row of this export.
    # (datetime.utcnow() is deprecated and returns a naive value.)
    stamp = datetime.now(timezone.utc).isoformat()

    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(("source", "caption", "timestamp"))
        writer.writerows((src, text, stamp) for src, text in zip(sources, texts))
    return out_path

# Example:
# out_csv = save_captions_to_csv(sources, captions, out_path="captions.csv")
# print("Saved:", out_csv)


In [12]:
# Optional interactive demo. Any failure (missing gradio, launch error) is
# reported with its message rather than stopping the notebook.
try:
    import gradio as gr

    def _caption_image(image, prompt):
        """Button callback: caption the uploaded image; no image gives an empty string."""
        # An empty prompt box arrives as "", which is mapped to None.
        return "" if image is None else cap.caption(image, prompt=prompt or None)

    with gr.Blocks() as demo:
        gr.Markdown("# BLIP Captioner — Demo")
        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload an image")
                prompt_box = gr.Textbox(
                    label="Prompt (optional)",
                    placeholder="e.g., 'a concise caption of the main scene'",
                )
                caption_button = gr.Button("Caption")
            # Right column: the generated caption.
            with gr.Column():
                caption_box = gr.Textbox(label="Caption", lines=3)
        caption_button.click(
            _caption_image,
            inputs=[image_input, prompt_box],
            outputs=[caption_box],
        )
    demo.launch(share=False)
except Exception as exc:
    print("Gradio not installed or failed to launch:", exc)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>