# Setup

## Install

In [2]:
!pip install qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.16.2-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.16.2-py3-none-any.whl (377 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.16.2


In [11]:
!pip install tqdm



## Import

In [17]:
import os
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue, Range

## Qdrant Cloud

In [4]:
from google.colab import userdata
client = QdrantClient(url=userdata.get("Qdrant_endpoint"), api_key=userdata.get("Qdrant"))

# Creation

## Collection

In [10]:
df = pd.read_pickle('parasave_embedded.pkl')

In [13]:
# vector dimensions
dense_dim = len(df['dense_vector'].iloc[0])
sparse_dim = len(df['sparse_vector'].iloc[0])

print(f"Dense dimension: {dense_dim}")
print(f"Sparse dimension: {sparse_dim}")

#collection avec 2 vecteurs (hybrid)
client.create_collection(
    collection_name="wellness_products",
    vectors_config={
        "dense": VectorParams(
            size=dense_dim,
            distance=Distance.COSINE
        ),
        "sparse": VectorParams(
            size=sparse_dim,
            distance=Distance.COSINE
        )
    }
)

print("Collection created")

Dense dimension: 384
Sparse dimension: 2809
Collection created


## Points

In [14]:
#set
points = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preparing points"):
    point = PointStruct(
        id=idx,
        vector={
            "dense": row['dense_vector'],
            "sparse": row['sparse_vector']
        },
        payload={
            "product_name": row['product_name'],
            "product_brand": row['product_brand'],
            "price": float(row['price']),
            "promo": row.get('promo'),
            "category": row['category'],
            "ingredients": row['ingredients'],
            "description": row.get('description'),
            "url": row['url'],
            "scraping_date": (row['scraping_date'])
        }
    )
    points.append(point)

Preparing points: 100%|██████████| 557/557 [00:00<00:00, 731.86it/s]


In [15]:
#insert
batch_size = 100
for i in tqdm(range(0, len(points), batch_size), desc="Uploading to Qdrant"):
    batch = points[i:i+batch_size]
    client.upsert(
        collection_name="wellness_products",
        points=batch
    )

print(f"{len(points)} produits insérés dans Qdrant Cloud!")

Uploading to Qdrant: 100%|██████████| 6/6 [00:07<00:00,  1.23s/it]

557 produits insérés dans Qdrant Cloud!





### Payload indexes

In [16]:
client.create_payload_index(
    collection_name="wellness_products",
    field_name="category",
    field_schema="keyword"
)

client.create_payload_index(
    collection_name="wellness_products",
    field_name="price",
    field_schema="float"
)

"""client.create_payload_index(
    collection_name="wellness_products",
    field_name="product_brand",
    field_schema="keyword"
)

client.create_payload_index(
    collection_name="wellness_products",
    field_name="product_name",
    field_schema="keyword"
)"""

'client.create_payload_index(\n    collection_name="wellness_products",\n    field_name="product_brand",\n    field_schema="keyword"\n)\n\nclient.create_payload_index(\n    collection_name="wellness_products",\n    field_name="product_name",\n    field_schema="keyword"\n)'

### Remark : Fixing an error in the Category

In [19]:
# 1. Find and update all lowercase "solar" to "Solar"
client.set_payload(
    collection_name="wellness_products",
    payload={"category": "solar"},
    points=Filter(
        must=[
            FieldCondition(key="category", match=MatchValue(value="Solar"))
        ]
    )
)

UpdateResult(operation_id=12, status=<UpdateStatus.COMPLETED: 'completed'>)

#Filtering