In [1]:
from pymongo import MongoClient

# connect to Mongo
client = MongoClient("mongodb+srv:.net/?retryWrites=true&w=majority&appName=ItemSet")
db = client["Products"]
collection = db["DevClip"]

# Retrieve one item from the collection
try:
    document = collection.find_one()
    if document:
        print("Read access confirmed. Sample document:", document)
    else:
        print("Connected but no documents found in the collection.")
except Exception as e:
    print("Read access failed:", str(e))

Read access confirmed. Sample document: {'_id': ObjectId('648437b0f0f526c7794ff3ea'), 'item_id': 3, 'item_url': 'https://www.urbanoutfitters.com/shop/uo-wildflower-lace-babydoll-mini-dress?category=dresses&color=053&type=REGULAR', 'product_brand': 'Urban Outfitters', 'short_description': 'Wildflower Lace Mini Dress', 'price': 69, 'colors': 'White', 'long_description': 'WE NO LONGER SELL URBAN OUTFITTERS. Babydoll mini dress with lacey detailing at the neckline + sleeves and eyelet flowers at the hem. Cut with a square neckline, an empire waist and a swingy skirt. Complete with a button placket at the front and an adjustable tie at the back. Only at UO.', 'image_urls': ['https://images.urbndata.com/is/image/UrbanOutfitters/79607115_053_b?$xlarge$&fit=constrain&qlt=80&wid=640', 'https://images.urbndata.com/is/image/UrbanOutfitters/79607115_053_d?$xlarge$&fit=constrain&qlt=80&wid=640', 'https://images.urbndata.com/is/image/UrbanOutfitters/79607115_053_e?$xlarge$&fit=constrain&qlt=80&wid=6

In [4]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
import requests
from io import BytesIO
from transformers import CLIPProcessor, CLIPModel
import torch


model = CLIPModel.from_pretrained("patrickjohncyh/fashion-clip")
processor = CLIPProcessor.from_pretrained("patrickjohncyh/fashion-clip")

In [5]:
from pymongo import MongoClient
import requests
from PIL import Image
from io import BytesIO
import torch

# Connect to the MongoDB database
client = MongoClient("mongodb+srdb.net/?retryWrites=true&w=majority&appName=ItemSet")
db = client["Products"]
collection = db["DevClip"]

def get_image_embedding(image_url):

    response = requests.get(image_url)
    image = Image.open(BytesIO(response.content))
    
    inputs = processor(images=image, return_tensors="pt")
    
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    
    return image_features.squeeze().tolist()

from tqdm import tqdm

total_items = collection.count_documents({})
for item in tqdm(collection.find(), total=total_items, desc="Processing items"):
    if item['image_urls']:
        first_image_url = item['image_urls'][0]
        try:
            embedding = get_image_embedding(first_image_url)
            
            collection.update_one(
                {'_id': item['_id']},
                {'$set': {'image_embedding': embedding}}
            )
        except Exception as e:
            print(f"Error processing item {item['_id']}: {str(e)}")
    else:
        print(f"No image URLs for item {item['_id']}")

print("Finished updating embeddings")

Processing items:  33%|███▎      | 546/1633 [04:57<11:19,  1.60it/s]

Error processing item 648437e1f0f526c7794ff60b: cannot identify image file <_io.BytesIO object at 0x128fd45e0>


Processing items:  53%|█████▎    | 864/1633 [08:37<07:30,  1.71it/s]

Error processing item 648437ebf0f526c7794ff682: cannot identify image file <_io.BytesIO object at 0x1294e38d0>


Processing items:  63%|██████▎   | 1024/1633 [10:34<06:16,  1.62it/s]

Error processing item 64843814f0f526c7794ff847: cannot identify image file <_io.BytesIO object at 0x1299a00e0>


Processing items:  63%|██████▎   | 1036/1633 [10:45<08:52,  1.12it/s]

Error processing item 64843818f0f526c7794ff871: cannot identify image file <_io.BytesIO object at 0x1294e1350>


Processing items:  70%|███████   | 1151/1633 [12:30<05:36,  1.43it/s]

Error processing item 6484381ef0f526c7794ff8b2: cannot identify image file <_io.BytesIO object at 0x129c603b0>


Processing items:  72%|███████▏  | 1170/1633 [12:47<06:47,  1.14it/s]

Error processing item 64843824f0f526c7794ff8f2: cannot identify image file <_io.BytesIO object at 0x129cc80e0>


Processing items:  72%|███████▏  | 1177/1633 [12:52<05:10,  1.47it/s]

Error processing item 64843825f0f526c7794ff901: cannot identify image file <_io.BytesIO object at 0x129cc9210>


Processing items:  76%|███████▌  | 1240/1633 [13:45<04:11,  1.56it/s]

Error processing item 64843819f0f526c7794ff877: cannot identify image file <_io.BytesIO object at 0x129e19850>


Processing items:  77%|███████▋  | 1257/1633 [13:59<03:55,  1.60it/s]

Error processing item 64843821f0f526c7794ff8cf: cannot identify image file <_io.BytesIO object at 0x129e9d080>


Processing items: 100%|██████████| 1633/1633 [19:50<00:00,  1.37it/s]

Finished updating embeddings





In [5]:
from pymongo.operations import SearchIndexModel
from pymongo import MongoClient


# Connect to the MongoDB database
client = MongoClient("mongodb+srdb.net/?retryWrites=true&w=majority&appName=ItemSet")
db = client["Products"]
collection = db["DevClip"]

# Attempt to drop the existing index if it exists
try:
    collection.drop_index("image_embedding_vector_index")
    print("Index 'image_embedding_vector_index' dropped successfully.")
except Exception as e:
    if "IndexNotFound" in str(e):
        print("Index 'image_embedding_vector_index' does not exist. Proceeding to create it.")
    else:
        raise

# Define the index
index_definition = {
    "mappings": {
        "dynamic": True,
        "fields": {
            "image_embedding": {
                "type": "knnVector",
                "dimensions": 512,
                "similarity": "cosine"
            }
        }
    }
}

# Create the SearchIndexModel
index_name = "image_embedding_vector_index"
search_index_model = SearchIndexModel(index_definition, name=index_name)

# Create the index
result = collection.create_search_index(search_index_model)

print(f"Index '{index_name}' created successfully. Result: {result}")

NameError: name 'MongoClient' is not defined

In [60]:
import time

def get_embedding(input_data, input_type='image'):
    if input_type == 'image':
        if isinstance(input_data, str):  # If it's a URL
            response = requests.get(input_data)
            image = Image.open(BytesIO(response.content))
        else:  # If it's already a PIL Image
            image = input_data
        
        inputs = processor(images=image, return_tensors="pt")
        
        with torch.no_grad():
            features = model.get_image_features(**inputs)
    elif input_type == 'text':
        inputs = processor(text=input_data, return_tensors="pt", padding=True)
        print("inputs", inputs)

        with torch.no_grad():
            features = model.get_text_features(**inputs)

    
    else:
        raise ValueError("Invalid input_type. Must be 'image' or 'text'.")
    
    return features.squeeze().tolist()



text = "slutty dress"
text_embedding = get_embedding(text, input_type='text')


query_image_url = "https://www.lulus.com/images/product/xlarge/8401921_1714576.jpg?w=195&hdpi=1"
query_embedding = get_embedding(query_image_url, input_type='image')


pipeline = [
    {
        "$vectorSearch": {
            "index": "image_embedding_vector_index",
            "path": "image_embedding",
            "queryVector": text_embedding,
            "numCandidates": 100,
            "limit": 10
        }
    },
    {
        "$project": {
            "_id": 1,
            "item_id": 1,
            "item_url": 1,
            "product_brand": 1,
            "short_description": 1,
            "image_urls": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
]

start_time = time.time()
results = list(collection.aggregate(pipeline))
end_time = time.time()
print(f"MDB QUERY Time taken: {end_time - start_time:.4f} seconds")
results

inputs {'input_ids': tensor([[49406,  7224,  1856,  2595, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
MDB QUERY Time taken: 0.0708 seconds


[{'_id': ObjectId('64843818f0f526c7794ff873'),
  'item_id': 2494,
  'item_url': 'https://us.princesspolly.com/products/micah-mini-dress-blue-multi',
  'product_brand': 'Princess Polly',
  'short_description': 'Micah Mini Dress Blue Multi',
  'image_urls': ['https://stylimages.s3.amazonaws.com/2494_1.jpg',
   'https://stylimages.s3.amazonaws.com/2494_2.jpg',
   'https://stylimages.s3.amazonaws.com/2494_3.jpg',
   'https://stylimages.s3.amazonaws.com/2494_4.jpg',
   'https://stylimages.s3.amazonaws.com/2494_5.jpg',
   'https://stylimages.s3.amazonaws.com/2494_6.jpg',
   'https://stylimages.s3.amazonaws.com/2494_7.jpg',
   'https://stylimages.s3.amazonaws.com/2494_8.jpg',
   'https://stylimages.s3.amazonaws.com/2494_10.jpg'],
  'score': 0.6711717844009399},
 {'_id': ObjectId('6484381df0f526c7794ff8aa'),
  'item_id': 2551,
  'item_url': 'https://us.princesspolly.com/products/dale-mini-dress-black',
  'product_brand': 'Princess Polly',
  'short_description': 'Dale Mini Dress Black',
  'imag