<a href="https://colab.research.google.com/github/OneFineStarstuff/Cosmic-Brilliance/blob/main/Vision_Encoder_Adapter_and_Fusion_Prompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import base64
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Hugging Face hub identifier of the CLIP checkpoint used for image embeddings.
_CLIP_ID = "openai/clip-vit-base-patch32"
# Prefer the GPU when one is present, but fall back to CPU so that loading the
# model does not crash on CPU-only hosts.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Pre-processing pipeline matching the checkpoint.
processor = CLIPProcessor.from_pretrained(_CLIP_ID)
# The CLIP model itself, moved to the selected device once at import time.
clip_model = CLIPModel.from_pretrained(_CLIP_ID).to(_DEVICE)

def image_to_embedding(img_path: str) -> torch.Tensor:
    """Load an image from disk and return its CLIP image embedding.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        The tensor produced by ``clip_model.get_image_features`` for the single
        image, computed without gradient tracking. Presumably shaped
        ``(1, 512)`` for the checkpoint configured in this file -- confirm
        against the model's projection size.
    """
    # Image.open is lazy and keeps the file handle open; the context manager
    # releases it, while convert() hands back a fully loaded, independent copy.
    with Image.open(img_path) as raw:
        rgb = raw.convert("RGB")
    # Send the inputs to whichever device the model actually lives on rather
    # than assuming a GPU is present.
    inputs = processor(images=rgb, return_tensors="pt").to(clip_model.device)
    with torch.no_grad():
        return clip_model.get_image_features(**inputs)

def multimodal_prompt(
    img_emb: torch.Tensor,
    text_context: str,
    n: int = 3,
    sig_chars: int = 128,
) -> str:
    """Build a text prompt that embeds an image signature plus textual context.

    The embedding is L2-normalised along its last dimension, cast to float16,
    hex-encoded (base16) and truncated to ``sig_chars`` characters. Each
    float16 value takes four hex characters, so the default of 128 keeps only
    the first 32 values of the flattened embedding.

    Args:
        img_emb: Floating-point image embedding tensor. It may live on any
            device and may require grad.
        text_context: Free-form context; surrounding whitespace is stripped.
        n: Number of questions the prompt asks for.
        sig_chars: Maximum number of hex characters kept in the signature.

    Returns:
        The assembled prompt string.
    """
    # Detach first: Tensor.numpy() raises on tensors that require grad.
    emb = img_emb.detach()
    # Clamp the norm so an all-zero embedding yields zeros instead of NaN;
    # any norm above the smallest normal float is left untouched.
    norm = emb.norm(dim=-1, keepdim=True).clamp_min(torch.finfo(emb.dtype).tiny)
    norm_emb = emb / norm
    # float16 for compactness, base16 for text safety.
    half = norm_emb.cpu().numpy().astype("float16")
    img_sig = base64.b16encode(half).decode()[:sig_chars]
    return (
        "You are a multimodal researcher.\n"
        f"IMAGE_SIG: {img_sig}\n"
        f"Context:\n{text_context.strip()}\n\n"
        f"Generate {n} deep, open-ended questions integrating visual insights."
    )

if __name__ == "__main__":
    # Demo run: embed a sample image, fuse it with a short textual context,
    # and print the resulting prompt.
    emb = image_to_embedding(img_path="example.jpg")
    prompt = multimodal_prompt(
        img_emb=emb,
        text_context="Microscopy image of novel crystalline structure",
        n=5,
    )
    print(prompt)