In [1]:
import io
import os

import numpy as np
import replicate
import requests
import torch
from datasets import load_dataset
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import LlavaProcessor, LlavaForConditionalGeneration

In [None]:
# Report whether the Replicate credential is present in the environment
# before any later cell tries to call the API.
if "REPLICATE_API_TOKEN" in os.environ:
    print("✓ Replicate API token found")
else:
    for notice in (
        "\n⚠️  WARNING: REPLICATE_API_TOKEN not found in environment variables",
        "Please set it with: export REPLICATE_API_TOKEN='your-token-here'",
        "Or uncomment and set it below:",
    ):
        print(notice)
    #os.environ['REPLICATE_API_TOKEN'] = ''

✓ Replicate API token found


In [6]:
model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)

# Load the weights in float32 without naming a device, then decide below
# whether to move the model to an accelerator.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, dtype=torch.float32, low_cpu_mem_usage=True
)

# Accelerator preference: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device, device_note = torch.device("cuda"), "✓ Using CUDA GPU"
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device, device_note = torch.device("mps"), "✓ Using Apple Silicon MPS"
else:
    device, device_note = torch.device("cpu"), "✓ Using CPU (this will be slower)"

# Only an accelerator target needs an explicit transfer.
if device.type != "cpu":
    model = model.to(device)
print(device_note)

print(f"✓ Model loaded on device: {device}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
from datasets import load_dataset

# Open the Flickr30k test split in streaming mode and keep only the image
# of the first example it yields.
dataset = load_dataset("nlphuji/flickr30k", split="test", streaming=True)
first_example = next(iter(dataset))
image_0 = first_example['image']

In [14]:
# Show the image's concrete type, then its repr, one per line.
print(type(image_0), image_0, sep="\n")

<class 'PIL.JpegImagePlugin.JpegImageFile'>
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=333x500 at 0x1DC91EED0>


In [15]:
from PIL import Image

# Accept array input as a fallback by wrapping it in a PIL image.
if not isinstance(image_0, Image.Image):
    image_0 = Image.fromarray(image_0)

# Always normalise to RGB, whatever mode the image arrived in.
image_0 = image_0.convert('RGB')

prompt = "USER: <image>\nDescribe this image in detail.\nASSISTANT:"
inputs = processor(text=prompt, images=image_0, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)

# generate() returns the prompt tokens followed by the newly generated ones.
# Decode only the continuation so the caption does not contain the
# "USER: ... ASSISTANT:" scaffold, which would otherwise leak into the
# text-to-image prompt and into the similarity comparison.
prompt_length = inputs["input_ids"].shape[1]
caption_1 = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

Unused or unrecognized kwargs: do_pad.


KeyError: 'image_sizes'

In [13]:
# NOTE(review): this cell repeats the captioning step of the preceding cell
# (same prompt, same image, same outputs); it is kept so the cell sequence
# is unchanged, but one of the two can be dropped.
prompt = "USER: <image>\nDescribe this image in detail.\nASSISTANT:"
inputs = processor(text=prompt, images=image_0, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)

# generate() returns the prompt tokens followed by the newly generated ones.
# Decode only the continuation so the caption does not contain the
# "USER: ... ASSISTANT:" scaffold.
prompt_length = inputs["input_ids"].shape[1]
caption_1 = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

Unused or unrecognized kwargs: do_pad.


KeyError: 'image_sizes'

In [None]:
# Ask SDXL on Replicate to render an image from the first caption.
output_url = replicate.run(
    "stability-ai/sdxl",
    input={"prompt": caption_1}
)
if not output_url:
    raise RuntimeError("Replicate returned no images for the caption prompt")

# Download the first result completely before decoding it:
# - the timeout (seconds) keeps a stalled connection from hanging the kernel,
# - raise_for_status() surfaces HTTP failures instead of handing an error
#   page to PIL,
# - decoding from an in-memory buffer avoids PIL reading lazily from the
#   undecoded raw socket stream.
response = requests.get(output_url[0], timeout=60)
response.raise_for_status()
image_1 = Image.open(io.BytesIO(response.content)).convert("RGB")

In [None]:
# Caption the regenerated image with the same prompt as the original image.
# Keyword arguments match the first captioning call and do not depend on
# the processor's positional parameter order; the image is normalised to
# RGB the same way image_0 was.
inputs = processor(
    text=prompt, images=image_1.convert("RGB"), return_tensors="pt"
).to(model.device)
output = model.generate(**inputs, max_new_tokens=100)

# generate() returns the prompt tokens followed by the newly generated ones.
# Decode only the continuation so the caption excludes the prompt text.
prompt_length = inputs["input_ids"].shape[1]
caption_2 = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

In [None]:
from sentence_transformers import SentenceTransformer

# Embed each caption separately with the sentence encoder.
sim_model = SentenceTransformer('all-MiniLM-L6-v2')
emb1 = sim_model.encode(caption_1)
emb2 = sim_model.encode(caption_2)

# Cosine similarity: dot product divided by the product of the L2 norms.
norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
similarity = np.dot(emb1, emb2) / norm_product

print(
    f"Caption 1: {caption_1}\n"
    f"Caption 2: {caption_2}\n"
    f"Semantic similarity: {similarity:.3f}"
)