In [None]:
import os
import time
from io import BytesIO

import requests
import torch
from fastapi import FastAPI, HTTPException
from PIL import Image
from pydantic import BaseModel
from pymongo import MongoClient
from transformers import CLIPProcessor, CLIPModel

# Connect to MongoDB. The connection string embeds a username and password, so
# it is read from the MONGODB_URI environment variable instead of being written
# into the notebook (a missing variable raises KeyError naming it).
client = MongoClient(os.environ["MONGODB_URI"])
db = client["Products"]
collection = db["DevClip"]

# Retrieve one item from the collection as a read-access smoke test.
# Only the database call is inside the try, so the "Read access failed"
# message can never be printed for an unrelated error.
try:
    document = collection.find_one()
except Exception as e:
    print("Read access failed:", str(e))
else:
    if document:
        print("Read access confirmed. Sample document:", document)
    else:
        print("Connected but no documents found in the collection.")

# Load the CLIP model and its processor. This is independent of the database
# check above: a loading failure propagates with its real traceback, and
# `model` / `processor` are defined whenever this cell completes.
model = CLIPModel.from_pretrained("patrickjohncyh/fashion-clip")
processor = CLIPProcessor.from_pretrained("patrickjohncyh/fashion-clip")

In [None]:
# Connect to the MongoDB database.
# The connection string (including credentials) comes from the MONGODB_URI
# environment variable so that no secret is stored in the notebook.
client = MongoClient(os.environ["MONGODB_URI"])
db = client["Products"]
collection = db["DevClip"]

def get_embedding(input_data, input_type='image', timeout=10):
    """Embed an image or a piece of text with the notebook's CLIP model.

    Uses the module-level ``processor`` and ``model`` objects.

    Args:
        input_data: For ``input_type='image'``, either a URL string (the image
            is downloaded) or an already-loaded image object that is passed to
            the processor as-is. For ``input_type='text'``, the text to embed.
        input_type: ``'image'`` or ``'text'``.
        timeout: Seconds to wait for the image download before giving up.
            Only used when ``input_data`` is a URL.

    Returns:
        The features converted with ``squeeze().tolist()``: a flat list of
        floats for a single input.

    Raises:
        ValueError: If ``input_type`` is neither ``'image'`` nor ``'text'``.
        requests.RequestException: If the download times out, fails, or the
            server answers with an HTTP error status.
    """
    # Validate first so a bad argument fails before any network or model work.
    if input_type not in ('image', 'text'):
        raise ValueError("Invalid input_type. Must be 'image' or 'text'.")

    if input_type == 'image':
        if isinstance(input_data, str):  # If it's a URL
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(input_data, timeout=timeout)
            # Fail on 4xx/5xx here instead of handing an error page to PIL.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
        else:  # If it's already a PIL Image
            image = input_data

        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = model.get_image_features(**inputs)
    else:
        # truncation=True asks the tokenizer to cut over-long text down to its
        # maximum length rather than passing an over-long sequence to the model.
        inputs = processor(
            text=input_data, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            features = model.get_text_features(**inputs)

    return features.squeeze().tolist()

from tqdm import tqdm

# Backfill: embed the first image of every product and store the vector on the
# product's document under `image_embedding`.
total_items = collection.count_documents({})

# Only `image_urls` (plus the always-returned `_id`) is read below, so project
# everything else away to keep the cursor traffic small.
items = collection.find({}, {'image_urls': 1})

for item in tqdm(items, total=total_items, desc="Processing items"):
    # .get() so a document without the field is reported and skipped instead of
    # raising KeyError and aborting the whole run.
    image_urls = item.get('image_urls')
    if not image_urls:
        print(f"No image URLs for item {item['_id']}")
        continue

    # Best-effort per item: a failed download, decode or update is logged and
    # the loop moves on to the next document.
    try:
        image_embedding = get_embedding(image_urls[0], input_type='image')
        collection.update_one(
            {'_id': item['_id']},
            {'$set': {'image_embedding': image_embedding}}
        )
    except Exception as e:
        print(f"Error processing item {item['_id']}: {str(e)}")

print("Finished updating embeddings")

In [None]:
from pymongo.operations import SearchIndexModel
from pymongo import MongoClient

# Connect to the MongoDB database.
# The connection string (including credentials) comes from the MONGODB_URI
# environment variable so that no secret is stored in the notebook.
client = MongoClient(os.environ["MONGODB_URI"])
db = client["Products"]
collection = db["DevClip"]

# Single source of truth for the index name used by drop, wait and create.
index_name = "image_embedding_vector_index"

# Attempt to drop the existing index if it exists
try:
    collection.drop_search_index(index_name)
    print("Index 'image_embedding_vector_index' dropped successfully.")
except Exception as e:
    # A missing index is expected on the first run; anything else is re-raised.
    if "IndexNotFound" in str(e):
        print(str(e))
        print("Index 'image_embedding_vector_index' does not exist. Proceeding to create it.")
    else:
        raise

# The drop is assumed to complete asynchronously on the server (TODO confirm).
# Poll until the index no longer appears in the listing so the create call
# below does not run while an index of the same name still exists.
drop_deadline = time.monotonic() + 120
while list(collection.list_search_indexes(index_name)):
    if time.monotonic() > drop_deadline:
        raise TimeoutError(
            f"Timed out waiting for search index '{index_name}' to be dropped."
        )
    time.sleep(5)

# Define the index: `image_embedding` is indexed as a 512-dimensional vector
# compared by cosine similarity; other fields are mapped dynamically.
index_definition = {
    "mappings": {
        "dynamic": True,
        "fields": {
            "image_embedding": {
                "type": "knnVector",
                "dimensions": 512,
                "similarity": "cosine"
            }
        }
    }
}


# Create the SearchIndexModel
search_index_model = SearchIndexModel(index_definition, name=index_name)

# Create the index
result = collection.create_search_index(search_index_model)

print(f"Index '{index_name}' created successfully. Result: {result}")

In [None]:
import time

# Text query embedding. It is computed here but not used by the pipeline below,
# which searches by image only.
text = "pink"
text_embedding = get_embedding(text, input_type='text')

# Image query: embed a reference product photo and look for its nearest
# neighbours among the stored `image_embedding` vectors.
query_image_url = "https://www.lulus.com/images/product/xlarge/8401921_1714576.jpg?w=195&hdpi=1"
query_embedding = get_embedding(query_image_url, input_type='image')

pipeline = [
    {
        # Nearest-neighbour search: consider 100 candidates, keep the best 10.
        "$vectorSearch": {
            "index": "image_embedding_vector_index",
            "path": "image_embedding",
            "queryVector": query_embedding,
            "numCandidates": 100,
            "limit": 10
        }
    },
    {
        # Return only the display fields plus the similarity score.
        "$project": {
            "item_id": 1,
            "item_url": 1,
            "product_brand": 1,
            "short_description": 1,
            "image_urls": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
]

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start_time = time.perf_counter()
results = list(collection.aggregate(pipeline))
end_time = time.perf_counter()
print(f"MDB QUERY Time taken: {end_time - start_time:.4f} seconds")
results