# Image Vector Indexing and Text-to-Image Retrieval

This notebook will:
1. Load images from a directory  
2. Compute image embeddings using a pretrained vision model  
3. Build a FAISS index over those embeddings  
4. Encode a text query and retrieve the top-k most similar images  

In [None]:
# 1. Imports & Model Initialization
import os
import glob

import torch
import torch.nn.functional as F
import numpy as np
import faiss

from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

# Vision model to embed images
processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
vision_model.eval()

# Text model to embed queries
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1.5")
text_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
text_model.eval()

# Mean pooling helper for text embeddings
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask_expanded, dim=1)
    counts = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
    return summed / counts

In [8]:
# 2. Load Images and Compute Embeddings
IMAGE_DIR = "images"  # adjust path as needed
image_paths = sorted(glob.glob(os.path.join(IMAGE_DIR, "*.[jp][pn]g")))

image_embeddings = []
for path in image_paths:
    img = Image.open(path).convert("RGB")
    inputs = processor(img, return_tensors="pt")
    with torch.no_grad():
        out = vision_model(**inputs).last_hidden_state
    emb = F.normalize(out[:, 0], p=2, dim=1)  # take [CLS] token
    image_embeddings.append(emb.cpu().numpy().astype("float32"))

# Stack into array of shape (N, D)
image_embeddings = np.vstack(image_embeddings)
print(f"Loaded {len(image_paths)} images, embedding dim = {image_embeddings.shape[1]}")

Loaded 6 images, embedding dim = 768


In [9]:
# 3. Build FAISS Index
dim = image_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(image_embeddings)
print(f"FAISS index contains {index.ntotal} vectors")

FAISS index contains 6 vectors


In [10]:
# 4. Define Text-to-Image Search Function
def search_images_by_text(query, k=5):
    # Encode text
    encoded = tokenizer([query], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        txt_out = text_model(**encoded)
    txt_emb = mean_pooling(txt_out, encoded["attention_mask"])
    txt_emb = F.layer_norm(txt_emb, normalized_shape=(txt_emb.shape[1],))
    txt_emb = F.normalize(txt_emb, p=2, dim=1).cpu().numpy().astype("float32")
    
    # Search FAISS
    D, I = index.search(txt_emb, k)
    results = [(image_paths[idx], float(D[0][i])) for i, idx in enumerate(I[0])]
    return results

In [14]:
# 5. Run an Example Query
query = "winter village with snow"
top_results = search_images_by_text(query, k=5)

print(f"Top-5 matches for query: '{query}'\n")
for rank, (path, dist) in enumerate(top_results, start=1):
    print(f"{rank}. {path} (L2 distance = {dist:.4f})")

Top-5 matches for query: 'winter village with snow'

1. images\winter-village-scene-snowy-night-town.jpg (L2 distance = 1.7919)
2. images\winter_town.png (L2 distance = 1.8249)
3. images\snowman.jpg (L2 distance = 1.8722)
4. images\snow_forest.jpg (L2 distance = 1.8852)
5. images\beach-wave-sunset-coast-palm-tree-scenery.jpg (L2 distance = 1.9761)
