In [None]:
import os
import json
import torch
import clip
from PIL import Image
from pymongo import MongoClient
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance, Filter
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import os 

In [None]:
# --------------------------
# 1. Load JSON Metadata
# --------------------------
cwd = os.getcwd()
# Build the JSON file path relative to the current working directory
json_file_path = os.path.join(cwd, "/Sample_Data/metadata.json")

with open(json_file_path, "r") as f:
    products = json.load(f)

In [19]:
print(products[0])

{'id': '1', 'SKU': 'SKU-SONY-WH1000XM5-BEIGE', 'name': 'Sony WH-1000XM5 Noise Cancelling Wireless Headphones (Beige)', 'brand': 'Sony', 'category': 'Electronics / Audio / Headphones', 'color': 'Beige', 'price': 399.99, 'description': 'Premium over-ear wireless headphones with industry-leading noise cancellation, up to 30 hours of battery life, and superior sound quality.', 'text': 'Sony WH-1000XM5 Noise Cancelling Wireless Headphones (Beige) by Sony are premium over-ear wireless headphones with industry-leading noise cancellation, offering up to 30 hours of battery life and superior sound quality. Available in Beige, they belong to the Electronics / Audio / Headphones category and are priced at $399.99.', 'image_path': '/home/vicky/Product_Matching_Pipeline/Dataset/Sony headphone- 4.jpg'}


In [20]:
# --------------------------
# 2. Setup Qdrant Client
# --------------------------
qdrant_client = QdrantClient(host="localhost", port=6333)
collection_visual = "products_visual"
collection_text = "products_text"

# create the collection with vector dimension 512 (for CLIP ViT-B/32)
qdrant_client.create_collection(
    collection_name=collection_visual,
    vectors_config= VectorParams(size=512, distance=Distance.COSINE)
)


True

In [21]:
# create the collection with vector dimension 512 (for CLIP ViT-B/32)
qdrant_client.create_collection(
    collection_name=collection_text,
    vectors_config= VectorParams(size=512, distance=Distance.COSINE)
)

True

In [23]:
# --------------------------
# 3. Setup MongoDB Client
# --------------------------
mongo_client = MongoClient("mongodb://localhost:27017/")
db = mongo_client["product_db"]
metadata_collection = db["product_metadata"]

In [24]:
mongo_client = MongoClient("mongodb://localhost:27017/")
db_log = mongo_client["log_db"]
log_collection = db["systems_log"]

In [25]:
# Initialize the CLIP model and processor from Transformers.
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# Ensure you have a device set up (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [26]:
# --------------------------
# 5. Process Products & Ingest Data
# --------------------------
for product in products:
    image_path = product.get("image_path")
    text_description = product.get("text")
    
    # Process visual embedding
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        continue

    try:
        # Use the processor to process the image; returns a dict with 'pixel_values'
        image_inputs = processor(images=image, return_tensors="pt")
        # Move inputs to the appropriate device.
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue

    with torch.no_grad():
        # Get visual features using the Transformers CLIP model.
        visual_embedding = model.get_image_features(**image_inputs).cpu().numpy().flatten()
        print(f"Visual embedding vectors: {visual_embedding.tolist()}")

    # Process textual embedding
    try:
        # Use the processor to tokenize the text description.
        text_inputs = processor(text=text_description, return_tensors="pt", padding=True, truncation=True)
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
    except Exception as e:
        print(f"Error processing text for product {product.get('id')}: {e}")
        continue

    with torch.no_grad():
        # Get text features from the CLIP model.
        text_embedding = model.get_text_features(**text_inputs).cpu().numpy().flatten()
        print(f"Text embedding vectors: {text_embedding.tolist()}")

    # Insert metadata into MongoDB (assuming metadata_collection is defined)
    metadata_collection.insert_one(product)
    
    # Prepare the payload for Qdrant.
    payload = {
        "id": product["id"],
        "SKU": product["SKU"],
        "name": product["name"],
        "brand": product["brand"],
    }
    
    # Create point structures for visual and text embeddings.
    # Assuming PointStruct is a class that wraps the data for Qdrant.
    point_visual = PointStruct(
        id=int(product["id"]) if product["id"].isdigit() else product["id"],
        vector=visual_embedding.tolist(),
        payload=payload
    )

    point_text = PointStruct(
        id=int(product["id"]) if product["id"].isdigit() else product["id"],
        vector=text_embedding.tolist(),
        payload=payload
    )
    
    # Insert points into Qdrant (assuming qdrant_client.upsert is defined).
    qdrant_client.upsert(collection_name=collection_visual, points=[point_visual])
    qdrant_client.upsert(collection_name=collection_text, points=[point_text])
    print(f"Processed product {product['id']} - {product['name']}")

Visual embedding vectors: [-0.3755730986595154, -0.00012572109699249268, 0.36178770661354065, 0.5483094453811646, -0.02203775756061077, -0.32045307755470276, -0.006537679582834244, 0.2543994188308716, 0.2582471966743469, 0.014897525310516357, 0.008283719420433044, 0.08574330806732178, 0.20732243359088898, -0.5783698558807373, 0.2004951536655426, -0.4761693477630615, 0.6648873686790466, -0.1906675398349762, -0.22794486582279205, -0.30998507142066956, -0.21231499314308167, -0.26396530866622925, -0.03553009033203125, -0.25307369232177734, -0.2699938118457794, 0.701772928237915, -0.037737831473350525, -0.15923663973808289, 0.07145635038614273, 0.023796159774065018, -0.1885172426700592, -0.6192527413368225, 0.0982569009065628, -0.1648128628730774, -0.11565691232681274, -0.259182333946228, 0.058373238891363144, -0.3071926534175873, 0.12017984688282013, 0.746870756149292, -0.05518338084220886, 0.022430211305618286, 0.5856198668479919, -0.23600661754608154, 0.5301784873008728, -1.1132715940475

In [27]:
print("Data ingestion into Qdrant and MongoDB is complete.")

Data ingestion into Qdrant and MongoDB is complete.


In [None]:
def delete_mongodb_data():
    try:
        mongo_client = MongoClient("mongodb://localhost:27017/")
        db = mongo_client["product_db"]  # Replace with your database name
        collection_name = "product_metadata"  # Replace with your collection name
        collection = db[collection_name]
        
        # Delete all documents in the collection
        result = collection.delete_many({})
        print(f"Deleted {result.deleted_count} documents from MongoDB collection '{collection_name}'.")
        
        # Alternatively, if you wish to drop the entire collection:
        # collection.drop()
        # print(f"Dropped MongoDB collection '{collection_name}'.")
    except Exception as e:
        print("Error deleting MongoDB data:", e)

def delete_qdrant_data(collection_name):
    try:
        qdrant_client = QdrantClient(host="localhost", port=6333)
        #collection_name = "products"  # Change to your collection name

        # Approach 1: Delete all points using a match-all filter
        # Uncomment the following lines if you prefer to clear the collection without deleting it.
        # qdrant_client.delete(
        #     collection_name=collection_name,
        #     points_selector={"filter": Filter(match={})}
        # )
        # print(f"Deleted all points from Qdrant collection '{collection_name}'.")

        # Approach 2: Delete the entire collection
        qdrant_client.delete_collection(collection_name=collection_name)
        print(f"Deleted Qdrant collection '{collection_name}'.")
    except Exception as e:
        print("Error deleting Qdrant data:", e)