<a href="https://colab.research.google.com/github/OneFineStarstuff/Cosmic-Brilliance/blob/main/Multimodal_Fusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install open_clip_torch

In [None]:
import torch
import open_clip
from PIL import Image

# 1. Device setup: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 2. Load model + preprocessing
# The call yields a 3-tuple; the middle element is not needed here and is
# discarded into `_`.
clip_model, _, clip_pre = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)

# Inference-only configuration: place the model on the chosen device, switch
# it to eval mode, and turn off gradient tracking for its parameters.
clip_model = clip_model.to(device)
clip_model.eval()
clip_model.requires_grad_(False)

# 3. Batch multimodal encoding
def encode_multimodal_batch(images, texts, normalize=True):
    """Encode paired images and texts into one fused CLIP embedding per pair.

    Each image is embedded with the CLIP image tower and each text with the
    text tower; the two feature vectors of a pair are then concatenated along
    the last dimension.

    Args:
        images: Iterable of images, one per text. ``PIL.Image`` items are run
            through ``clip_pre``. ``torch.Tensor`` items are assumed to be
            already-preprocessed ``(C, H, W)`` tensors (i.e. the kind of
            tensor ``clip_pre`` produces) and are used as-is.
        texts: A single string or an iterable of strings, one per image.
        normalize: When true, L2-normalize the image features and the text
            features separately before concatenating them.

    Returns:
        Tensor of shape ``(batch_size, 2 * D)``, where ``D`` is the embedding
        width of ``clip_model``.

    Raises:
        ValueError: If the batch is empty, or if ``images`` and ``texts`` do
            not contain the same number of items.
    """
    # Materialize the inputs so they can be measured and validated up front.
    images = list(images)
    # A bare string is a batch of one text, not an iterable of characters.
    texts = [texts] if isinstance(texts, str) else list(texts)

    # Fail early with a clear message rather than letting torch.stack /
    # torch.cat raise an opaque shape error after the encoders have run.
    if not images:
        raise ValueError(
            "encode_multimodal_batch requires at least one image/text pair"
        )
    if len(images) != len(texts):
        raise ValueError(
            "images and texts must have the same length, got "
            f"{len(images)} images and {len(texts)} texts"
        )

    # Preprocess PIL images; tensors are taken as already preprocessed.
    img_tensors = torch.stack(
        [img if isinstance(img, torch.Tensor) else clip_pre(img) for img in images]
    ).to(device)
    txt_tokens = open_clip.tokenize(texts).to(device)

    with torch.no_grad():
        img_f = clip_model.encode_image(img_tensors)   # (B, D)
        txt_f = clip_model.encode_text(txt_tokens)     # (B, D)

    if normalize:
        # F.normalize clamps the norm with a small eps, so an all-zero feature
        # vector stays zero instead of turning into NaNs from a 0/0 division.
        img_f = torch.nn.functional.normalize(img_f, dim=-1)
        txt_f = torch.nn.functional.normalize(txt_f, dim=-1)

    # Concatenate image + text features
    return torch.cat([img_f, txt_f], dim=-1)           # (B, 2 * D)

# 4. Example usage
if __name__ == "__main__":
    # Example data. Each file is opened in a ``with`` block and copied into
    # memory, so its file handle is closed immediately instead of staying
    # open until the image object happens to be garbage collected.
    image_paths = ["cat.jpg", "dog.jpg"]
    images = []
    for path in image_paths:
        with Image.open(path) as img:
            images.append(img.copy())

    texts = [
        "a cute cat",
        "a happy dog",
    ]

    embeddings = encode_multimodal_batch(images, texts)
    print("Embedding shape:", embeddings.shape)  # (2, 1024) if D=512
    print("First vector snippet:", embeddings[0][:10])  # show first 10 dims