# Step 1: Install + Import Code

In [None]:
# 📌 Install required libraries
!pip install transformers torch pillow datasets

# ✅ Import libraries
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

# Step 2: Load the Pretrained CLIP Model & Processor

In [None]:
# ✅ The CLIP weights and the matching processor are both pulled from the same
#    checkpoint id, so it is written once and reused for the two loaders.
clip_checkpoint = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

print("Model and processor loaded successfully!")

Model and processor loaded successfully!


# Step 3: Load an Image

In [None]:
import io
from PIL import Image
import requests

# Use a public image URL
image_url = "https://images.unsplash.com/photo-1518791841217-8f162f1e1131"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(image_url, timeout=30)

# Fail loudly on an HTTP error status. Merely printing a message would leave
# `image` undefined (or holding a stale value from an earlier run), so later
# cells would break far away from the real cause.
response.raise_for_status()

# Decode the picture from the fully downloaded bytes.
image = Image.open(io.BytesIO(response.content))

# Ending the cell with `image` renders it inline in the notebook. Depending on
# which viewers Pillow finds on the machine, `Image.show()` may instead open an
# external window, so the picture would not reliably appear in the notebook.
image


In [None]:
# 📷 Load an image from URL
# Download with a timeout and an explicit status check, then decode from the
# complete response body. Reading the streamed `.raw` object instead skips the
# status check and requests' content decoding, so an error page or truncated
# transfer would only surface later as a confusing image-decoding failure.
image_response = requests.get(image_url, timeout=30)
image_response.raise_for_status()
image = Image.open(io.BytesIO(image_response.content))

# Ending the cell with `image` renders it inline in the notebook; `Image.show()`
# may hand the picture to an external viewer depending on the platform.
image

# Step 4: Encode Inputs & Get Predictions

In [None]:
# 📝 Define some candidate text prompts
texts = ["a photo of a cat", "a photo of a dog", "a photo of a horse", "a photo of a rabbit"]

# 🔄 Process image and text using the processor
#    (padding=True pads the prompts to a common length so they batch together)
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

# 🧠 Run inference through the model
#    Only a forward pass is needed, so gradient tracking is disabled to avoid
#    building an autograd graph and holding on to intermediate activations.
with torch.no_grad():
    outputs = model(**inputs)

# 🎯 Extract logits (similarity scores between image and text)
logits_per_image = outputs.logits_per_image

# 🔢 Convert logits to probabilities
#    The softmax over dim=1 normalizes across the candidate prompts.
probs = logits_per_image.softmax(dim=1)

# 🖨️ Print predictions
#    probs[0] is the row for the single image passed in; pair it with the prompts.
for text, prob in zip(texts, probs[0]):
    print(f"{text}: {prob.item():.4f}")


a photo of a cat: 0.9866
a photo of a dog: 0.0057
a photo of a horse: 0.0003
a photo of a rabbit: 0.0073


# Step 5: Load BLIP Model + Processor

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# ✅ The BLIP processor and the captioning model come from the same checkpoint
#    id, so it is named once and passed to both loaders.
blip_checkpoint = "Salesforce/blip-image-captioning-base"

blip_processor = BlipProcessor.from_pretrained(blip_checkpoint)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

print("BLIP model and processor loaded successfully!")


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

BLIP model and processor loaded successfully!


# Step 6: Generate Caption for the Image

In [None]:
# 🔄 Preprocess the image alone (no text prompt) into PyTorch tensors
inputs = blip_processor(
    images=image,
    return_tensors="pt",
)

# 🧠 Let the model generate caption token ids using its default generation settings
output = blip_model.generate(**inputs)

# 🖨️ Turn the first generated sequence back into text, dropping special tokens
first_sequence = output[0]
caption = blip_processor.decode(first_sequence, skip_special_tokens=True)
print("Generated caption:", caption)


Generated caption: a cat sitting on a couch with a pink pillow


# Step 7: Load BLIP-2 Model + Processor

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# ✅ Load the BLIP-2 processor and model from the OPT-2.7b checkpoint.
# NOTE(review): the following cell rebinds `blip2_processor` / `blip2_model` to the
# Flan-T5-XL checkpoint, so this load is superseded as soon as that cell runs.
blip2_checkpoint = "Salesforce/blip2-opt-2.7b"

blip2_processor = Blip2Processor.from_pretrained(blip2_checkpoint)
blip2_model = Blip2ForConditionalGeneration.from_pretrained(blip2_checkpoint)

print("BLIP-2 model and processor loaded successfully!")


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

BLIP-2 model and processor loaded successfully!


In [None]:
import gc

# Switch BLIP-2 to the Flan-T5-XL checkpoint.
# Python evaluates the right-hand side before rebinding the name, so a
# previously loaded BLIP-2 model would stay alive for the whole duration of
# the new load and both multi-gigabyte checkpoints would be resident at once.
# Dropping the old bindings first lets that memory be reclaimed.
# `pop(..., None)` keeps the cell runnable on a fresh kernel where the names
# do not exist yet.
globals().pop("blip2_model", None)
globals().pop("blip2_processor", None)
gc.collect()

blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/128k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.81G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

# Step 8: Generate Caption with BLIP-2

In [None]:
# Preprocess the image alone (no text prompt) into PyTorch tensors
inputs = blip2_processor(images=image, return_tensors="pt")

# Generate caption token ids, allowing at most 50 newly generated tokens
generation_options = {"max_new_tokens": 50}
output = blip2_model.generate(**inputs, **generation_options)

# Decode the first generated sequence without special tokens and print it
first_sequence = output[0]
caption = blip2_processor.decode(first_sequence, skip_special_tokens=True)
print("BLIP-2 Generated caption:", caption)

BLIP-2 Generated caption: a tabby cat is sitting on a couch


# Step 9: Visual Question Answering (VQA)

In [None]:
# The question to ask about the image
question = "What animal is in the picture?"

# Feed the image and the question text to the processor together
inputs = blip2_processor(
    images=image,
    text=question,
    return_tensors="pt",
)

# Generate answer token ids, allowing at most 50 newly generated tokens
generation_options = {"max_new_tokens": 50}
output = blip2_model.generate(**inputs, **generation_options)

# Decode the first generated sequence (special tokens dropped) and show the Q/A pair
first_sequence = output[0]
answer = blip2_processor.decode(first_sequence, skip_special_tokens=True)
print(f"Q: {question}")
print(f"A: {answer}")

Q: What animal is in the picture?
A: cat
