In [7]:
# Load model directly
from transformers import pipeline, AutoProcessor, AutoModelForImageTextToText
from PIL import Image 
import requests
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large",use_fast=True)
model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-large")

In [None]:
image_url = "https://images.ctfassets.net/awb1we50v0om/6tE2cm5qXWYZqMSNmJ8olk/7f9291c8d27a45b9cdef60e5965e43a1/Recipe2.jpg?w=1920&fm=webp&q=70"  # Indian samosa with chutney

image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")

# Prepare inputs with optional prompt
prompt = "Describe the food in the image."
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=50)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("🧾 Caption:", caption)

🧾 Caption: describe the food in the image. the burger and fries


In [5]:
# Load model directly
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification

processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14",use_fast=True)
model = AutoModelForZeroShotImageClassification.from_pretrained("openai/clip-vit-large-patch14")

In [11]:
import requests
from io import BytesIO
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel

def predict_from_url(
    image_url: str,
    labels: list[str],
    topk: int = 5,
    model_name: str = "openai/clip-vit-base-patch32"
):
    # 0. Load CLIP processor & model (on first call; cached thereafter)
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name).eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 1. Download the image
    resp = requests.get(image_url)
    resp.raise_for_status()
    img = Image.open(BytesIO(resp.content)).convert("RGB")

    # 2. Prepare inputs: text prompts + image, padded to max length
    inputs = processor(
        text=labels,
        images=img,
        return_tensors="pt",
        padding=True
    ).to(device)

    # 3. Forward pass through CLIP
    with torch.no_grad():
        outputs = model(**inputs)

    # 4. Compute cosine similarity between image embeddings and text embeddings
    image_embeds = outputs.image_embeds            # (1, hidden_dim)
    text_embeds = outputs.text_embeds              # (len(labels), hidden_dim)

    # normalize
    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds  = text_embeds  / text_embeds.norm(p=2, dim=-1, keepdim=True)

    # cosine similarity → logits
    logits = (100.0 * image_embeds @ text_embeds.T).squeeze(0)  # (len(labels),)

    # 5. Top-k
    probs = logits.softmax(dim=0)
    values, indices = probs.topk(topk)

    print(f"Top {topk} predictions for {image_url}:")
    for score, idx in zip(values, indices):
        label = labels[idx.item()]
        print(f"  {label}: {score.item():.4f}")

if __name__ == "__main__":
    labels = [
        "apple", "banana", "pizza", "sandwich", "orange",
        "broccoli", "donut", "carrot", "steak", "salad",
        "burger", "fries"
    ]
    image_url = (
        "https://images.ctfassets.net/awb1we50v0om/"
        "6tE2cm5qXWYZqMSNmJ8olk/7f9291c8d27a45b9cdef60e5965e43a1/"
        "Recipe2.jpg?w=1920&fm=webp&q=70"
    )
    predict_from_url(image_url, labels, topk=5)




Top 5 predictions for https://images.ctfassets.net/awb1we50v0om/6tE2cm5qXWYZqMSNmJ8olk/7f9291c8d27a45b9cdef60e5965e43a1/Recipe2.jpg?w=1920&fm=webp&q=70:
  burger: 0.9180
  fries: 0.0393
  steak: 0.0337
  sandwich: 0.0082
  salad: 0.0005


In [None]:
##Blip-2
# Load model directly
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip2-opt-2.7b")

Fetching 2 files:   0%|          | 0/2 [06:05<?, ?it/s]
