In [14]:
import os
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import chromadb
from chromadb.config import Settings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import shutil
import pandas as pd


In [15]:
# Load model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", use_safetensors=True)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [16]:
# Load image and text
image = Image.open("/Users/christinecym/Desktop/multimodal-search/image/1.jpg")  # replace with your image path
text = "A Die DIY Kit"

# Preprocess and encode
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Get embeddings
image_embedding = outputs.image_embeds[0].detach().numpy()
text_embedding = outputs.text_embeds[0].detach().numpy()

In [17]:
def embed_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        emb = model.get_image_features(**inputs)
        emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.squeeze().cpu().numpy().astype(np.float32).tolist()

In [18]:
def embed_text(text):
    inputs = processor(text=[text], return_tensors="pt")
    with torch.no_grad():
        emb = model.get_text_features(**inputs)
        emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.squeeze().cpu().numpy().astype(np.float32).tolist()

In [19]:
# # Initialize the vector data embedding
# chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
# collection = chroma_client.create_collection(name = 'product_images')

# Connect to persistent ChromaDB instance
chroma_client = chromadb.Client(Settings(persist_directory="chroma_storage", anonymized_telemetry=False))

# Or just delete the whole collection and recreate
collection = chroma_client.get_or_create_collection("product_images", embedding_function=None)


In [20]:
# Load metadata
df = pd.read_csv("/Users/christinecym/Desktop/multimodal-search/archive/amazon_products.csv")
metadata_lookup = {
    row["file_name"]: {
        "file_name": row["file_name"],
        "product_name": row["product_name"],
        "amazon_url": row["amazon_url"]
    }
    for _, row in df.iterrows()
}

# Load and Embed Product Images
image_path = "/Users/christinecym/Desktop/multimodal-search/image"
image_num = 0
for file in os.listdir(image_path):
    if file.lower().endswith(('.jpg', '.jpeg', '.png')):
        path = os.path.join(image_path, file)
        image_id = os.path.splitext(file)[0]
        embedding = embed_image(path)

        meta = metadata_lookup.get(file, {"file_name": file, "amazon_url": "#", "product_name": "Unknown Product"})

        collection.add(
            ids=[image_id],
            embeddings=[embedding],
            metadatas=[meta]
        )

        image_num += 1
        print(f"Embedding stored for: {file}")
        
print(f"\n Total images processed: {image_num}")



KeyError: 'file_name'

In [10]:
# Retriever
def retrieve_similar_products(query_text, top_k = 1):
    
    print(f"\n Encoding query: '{query_text}")
    
    query_vector = embed_text(query_text)
    print(f"\n Query encoded. Searching database...")
    
    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["embeddings", "metadatas"] 
    )
    
    print("Top results retreived.")
    return results

In [11]:
# Calculate cosine similarity
def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(
        torch.tensor(a), torch.tensor(b), dim=0
    ).item()

In [12]:
# Run text query
query = "dye kit"
results = retrieve_similar_products(query)
query_vec = embed_text(query)


# Filter out only image-type results
image_results = [
    (meta, emb) for meta, emb in zip(results["metadatas"][0], results["embeddings"][0])
    if meta and (
        meta.get("type") == "image" or meta.get("file_name", "").lower().endswith(('.jpg', '.jpeg', '.png'))
    )
]



if not image_results:
    print(" No image results found.")
else:
    meta, top_result_embedding = image_results[0]
    file_name = meta.get("file_name", "unknown.jpg")

    # Create IDs
    text_id = f"text_{query.replace(' ', '_')}"
    image_id = f"image_{os.path.splitext(file_name)[0]}"

    # Save to ChromaDB
    assert isinstance(query_vec, list), "Text embedding must be a list"
    collection.add(
        ids=[text_id],
        embeddings=[query_vec],
        metadatas=[{"type": "text", "query": query}]
    )


all_items = collection.get(include=["embeddings", "metadatas"])

print("IDs:", all_items["ids"])
print("Metadata:", all_items["metadatas"])
print("Embeddings:", all_items["embeddings"])








 Encoding query: 'dye kit

 Query encoded. Searching database...
Top results retreived.
IDs: ['8', '9', '14', '15', '17', '16', '12', '13', '11', '10', '20', '18', '19', '4', '5', '7', '6', '2', '3', '1', 'text_dye_kit']
Metadata: [{'file_name': '8.jpg'}, {'file_name': '9.jpg'}, {'file_name': '14.jpg'}, {'file_name': '15.jpg'}, {'file_name': '17.jpg'}, {'file_name': '16.jpg'}, {'file_name': '12.jpg'}, {'file_name': '13.jpg'}, {'file_name': '11.jpg'}, {'file_name': '10.jpg'}, {'file_name': '20.jpg'}, {'file_name': '18.jpg'}, {'file_name': '19.jpg'}, {'file_name': '4.jpg'}, {'file_name': '5.jpg'}, {'file_name': '7.jpg'}, {'file_name': '6.jpg'}, {'file_name': '2.jpg'}, {'file_name': '3.jpg'}, {'file_name': '1.jpg'}, {'type': 'text', 'query': 'dye kit'}]
Embeddings: [[ 0.03759085  0.02897384 -0.01279535 ...  0.0330596  -0.09382622
  -0.02085317]
 [-0.03361781 -0.02324725  0.02442953 ...  0.08871351 -0.01979954
   0.03376737]
 [ 0.01341182  0.01927202 -0.05148533 ...  0.05106765  0.0234710

In [12]:
# Calculate cosine similarity
def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(
        torch.tensor(a), torch.tensor(b), dim=0
    ).item()

similarity = cosine_similarity(image_embedding, text_embedding)
print(f"Cosine similarity between image and text: {similarity:.4f}")

Cosine similarity between image and text: 0.2236


In [None]:
import chromadb
from chromadb.utils import embedding_functions

# Create a ChromaDB client and collection
client = chromadb.Client()
collection = client.create_collection(name="clip_embeddings")

# Optionally define a custom embedding function (not needed here, we use precomputed)
# Insert items
collection.add(
    documents=["A photo of a cat"],
    embeddings=[text_embedding.tolist()],  # convert numpy array to list
    ids=["text1"]
)

collection.add(
    documents=["example.jpg"],  # you can use the filename or any identifier
    embeddings=[image_embedding.tolist()],
    ids=["image1"]
)

In [None]:
# Query similar embeddings
results = collection.query(
    query_embeddings=[text_embedding.tolist()],
    n_results=3
)

print(results)

{'ids': [['text1', 'image1']], 'embeddings': None, 'documents': [['A photo of a cat', 'example.jpg']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[0.0, 1.4534668922424316]]}
