<a href="https://colab.research.google.com/github/SARA3SAEED/abu-LLM/blob/main/4_abu_part_02_QDrant_DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install qdrant-client==1.11.1 sentence-transformers==3.0.1 datasets pympler==1.1

## Download Dataset

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd

# Arabic ("ar") configuration of the Cohere Wikipedia dump, opened with streaming=True.
ds = load_dataset(
    path="Cohere/wikipedia-22-12",
    name="ar",
    streaming=True,
    trust_remote_code=True,
)

In [None]:
# Collect every record belonging to the first `max_titles` distinct titles
# seen in the stream.
max_titles = 100
all_titles = set()
collected_recs = []

# The bar advances once per newly seen title, so its total is `max_titles`.
progress_bar = tqdm(total=max_titles)
for rec in ds["train"]:
    title = rec["title"]
    if title not in all_titles:
        # Stop on the first title *beyond* the quota. Checking before the add
        # (rather than after) keeps the records of the max_titles-th title
        # instead of registering that title and then dropping its records.
        if len(all_titles) == max_titles:
            break
        all_titles.add(title)
        progress_bar.update(1)

    collected_recs.append(rec)

progress_bar.close()

0it [00:00, ?it/s]

In [None]:
# Build the frame from the collected records and drop the columns listed here.
dropped_columns = ["url", "wiki_id", "views", "paragraph_id", "langs"]
collected_df = pd.DataFrame(collected_recs).drop(columns=dropped_columns)
print(collected_df.shape)

(8481, 3)


In [None]:
# ===== Collect text length (characters, after stripping surrounding whitespace)
collected_df["text_length"] = collected_df["text"].map(lambda passage: len(passage.strip()))

# Keep only the rows whose stripped text is at most 1500 characters long.
is_short_enough = collected_df["text_length"] <= 1500
cleaned_collected_df = collected_df[is_short_enough]
print(cleaned_collected_df.shape)

(8238, 4)


In [None]:
# Shuffle all rows (frac=1 samples the whole frame); the fixed seed keeps the order repeatable.
cleaned_collected_df = cleaned_collected_df.sample(
    frac=1,
    random_state=101,
)

## Setup Qdrant DB

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import os

# Directory passed to QdrantClient via `path=`; created up front if it is missing.
qdrant_db_path = "./qdrant_db"
os.makedirs(name=qdrant_db_path, exist_ok=True)

# Client opened on the local directory above.
qdrant_client = QdrantClient(path=qdrant_db_path)

In [None]:
# create new collection
collection_name = "ar_wiki_base"
vec_size = 384

# The database lives on disk at `qdrant_db_path`, so a collection left over
# from an earlier run would make `create_collection` fail. Drop it first so
# the cell can be re-run and always starts from an empty collection.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vec_size, distance=Distance.COSINE),
)

True

## Load Embedding Model

In [None]:
from sentence_transformers import SentenceTransformer

import torch

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on a runtime without CUDA.
embedding_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# ===== Slower and High Memory Consumption
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=embedding_device)

# NOTE(review): this flag is defined here but none of the visible
# `model.encode(...)` calls pass it — confirm whether it should be wired in.
normalize_embeddings = True

## Float32 | Push Data Into Qdrant Collection

In [None]:
# Embed the texts in batches and upsert them with sequential integer ids.
batch_size = 50
total = cleaned_collected_df.shape[0]
rec_id = 0

title_values = cleaned_collected_df["title"].values
text_values = cleaned_collected_df["text"].values

for start in tqdm(range(0, total, batch_size)):
    stop = start + batch_size
    batch_titles = title_values[start:stop]
    batch_texts = text_values[start:stop]
    # Converted to nested Python lists before building the points.
    batch_embeds = model.encode(batch_texts).tolist()

    points = [
        PointStruct(
            id=rec_id + offset,
            vector=vector,
            payload={"title": title, "text": text},
        )
        for offset, (title, text, vector) in enumerate(
            zip(batch_titles, batch_texts, batch_embeds)
        )
    ]
    # Continue the id sequence where this batch ended.
    rec_id += len(points)

    qdrant_client.upsert(collection_name=collection_name, points=points)

In [None]:
# search
query_text = "متى أصدرت انستجرام تطبيقها المخصص لعملاء نظام تشغيل ميكروسوفت"

# Encode the question and hand the vector to Qdrant as a plain list.
query_embed = model.encode(query_text).tolist()

# Five best matches from the collection currently named by `collection_name`.
search_request = {
    "collection_name": collection_name,
    "query_vector": query_embed,
    "limit": 5,
}
hits = qdrant_client.search(**search_request)

## Binary | Push Data Into Qdrant Collection

In [None]:
from qdrant_client.models import BinaryQuantization, BinaryQuantizationConfig
from sentence_transformers.quantization import quantize_embeddings

# create new collection
collection_name = "ar_wiki_binary"
vec_size = 384

# The database lives on disk, so a collection left over from an earlier run
# would make `create_collection` fail. Drop it first so the cell can be
# re-run and always starts from an empty collection.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vec_size, distance=Distance.COSINE),
    quantization_config=BinaryQuantization(
        binary=BinaryQuantizationConfig(
            always_ram=True,
        ),
    ),
)

# always_ram - whether to keep quantized vectors always cached in RAM or not.

# to delete a collection
# qdrant_client.delete_collection(collection_name=collection_name)

True

In [None]:
# Encode one sample sentence, quantize it with precision="binary", and print
# both array shapes for comparison.
batch_embeds = model.encode(["test text"])
binary_embeddings = quantize_embeddings(
    batch_embeds,
    precision="binary",
)

for embedding_array in (batch_embeds, binary_embeddings):
    print(embedding_array.shape)

(1, 384)
(1, 48)


In [None]:
# Embed the texts in batches and upsert them into `collection_name` with
# sequential integer ids. The full-precision embeddings are uploaded; the
# collection's own quantization_config is what was set at creation time.
batch_size = 50
total = cleaned_collected_df.shape[0]
rec_id = 0

for i in tqdm(range(0, total, batch_size)):
    batch_titles = cleaned_collected_df["title"].values[i:i+batch_size]
    batch_texts = cleaned_collected_df["text"].values[i:i+batch_size]
    # Convert to plain Python lists, as the float32 upload cell does, so each
    # PointStruct receives a list of floats rather than a numpy row.
    batch_embeds = model.encode(batch_texts).tolist()

    points = []
    for title, text, embed in zip(batch_titles, batch_texts, batch_embeds):
        points.append(PointStruct(
                        id=rec_id,
                        vector=embed,
                        payload={"title": title, "text": text}
                    ))
        rec_id += 1

    qdrant_client.upsert(collection_name=collection_name, points=points)

  0%|          | 0/165 [00:00<?, ?it/s]

In [None]:
from qdrant_client.models import SearchParams, QuantizationSearchParams

# search
query_text = "متى أصدرت انستجرام تطبيقها المخصص لعملاء نظام تشغيل ميكروسوفت"

# The encoded query is passed to Qdrant as returned by the model.
query_embed = model.encode(query_text)

# Five best matches from the collection currently named by `collection_name`.
search_request = {
    "collection_name": collection_name,
    "query_vector": query_embed,
    "limit": 5,
}
hits = qdrant_client.search(**search_request)

## Scalar | Push Data Into Qdrant Collection

In [None]:
from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig, ScalarType
from sentence_transformers.quantization import quantize_embeddings

# create new collection
collection_name = "ar_wiki_scalar_int8"
vec_size = 384

# The database lives on disk, so a collection left over from an earlier run
# would make `create_collection` fail. Drop it first so the cell can be
# re-run and always starts from an empty collection.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vec_size, distance=Distance.COSINE),
    quantization_config=ScalarQuantization(
        scalar=ScalarQuantizationConfig(
            type=ScalarType.INT8,
            quantile=0.99,
            always_ram=True,
        ),
    ),
)

# more here: https://qdrant.tech/documentation/guides/quantization/?q=binary#setting-up-scalar-quantization

True

In [None]:
from sentence_transformers.quantization import quantize_embeddings

# NOTE(review): calibration here is computed from the "title" column, while the
# upload cells embed the "text" column — confirm this is intended.
title_values = cleaned_collected_df["title"].values

# prepare calibration (always used for any next embeddings)
calibration_embeddings = model.encode(title_values)

# scale a batch: the first 1000 titles, quantized against the calibration set
batch_embeddings = model.encode(title_values[:1000])

int8_quantization_args = {
    "precision": "int8",
    "calibration_embeddings": calibration_embeddings,
}
int8_embeddings = quantize_embeddings(batch_embeddings, **int8_quantization_args)

In [None]:
# Embed the texts in batches and upsert them into `collection_name` with
# sequential integer ids. The full-precision embeddings are uploaded; the
# collection's own quantization_config is what was set at creation time.
batch_size = 50
total = cleaned_collected_df.shape[0]
rec_id = 0

for i in tqdm(range(0, total, batch_size)):
    batch_titles = cleaned_collected_df["title"].values[i:i+batch_size]
    batch_texts = cleaned_collected_df["text"].values[i:i+batch_size]
    # Convert to plain Python lists, as the float32 upload cell does, so each
    # PointStruct receives a list of floats rather than a numpy row.
    batch_embeds = model.encode(batch_texts).tolist()

    points = []
    for title, text, embed in zip(batch_titles, batch_texts, batch_embeds):
        points.append(PointStruct(
                        id=rec_id,
                        vector=embed,
                        payload={"title": title, "text": text}
                    ))
        rec_id += 1

    qdrant_client.upsert(collection_name=collection_name, points=points)

  0%|          | 0/165 [00:00<?, ?it/s]

In [None]:
from qdrant_client.models import SearchParams, QuantizationSearchParams

# search
query_text = "متى أصدرت انستجرام تطبيقها المخصص لعملاء نظام تشغيل ميكروسوفت"
query_embed = model.encode(query_text)

# Having the original vectors available, Qdrant can re-evaluate top-k search
# results using the original vectors (rescore=True).
quantization_params = QuantizationSearchParams(
    rescore=True,
    oversampling=2.0,
)
search_params = SearchParams(quantization=quantization_params)

hits = qdrant_client.search(
    collection_name=collection_name,
    query_vector=query_embed,
    limit=5,
    search_params=search_params,
)

# more here: https://qdrant.tech/documentation/guides/quantization/?q=binary#searching-with-quantization

## Re-Ranking

In [None]:
from transformers import AutoModelForSequenceClassification

import torch

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the cell still runs on a runtime without CUDA.
reranker_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# trust_remote_code=True lets the checkpoint's own model code be loaded.
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    'jinaai/jina-reranker-v2-base-multilingual',
    torch_dtype="auto",
    trust_remote_code=True,
).to(reranker_device)

# Switch to inference mode; the call's return value is the cell's output.
reranker_model.eval()

In [None]:
# Example query and documents
query = "منتجات ازالة البقع السوداء تحت العينين"
documents = [
    "Organic skincare for sensitive skin with aloe vera and chamomile.",
    "New makeup trends focus on bold colors and innovative techniques",
    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
    "Produits pour éliminer les taches brunes sous les yeux",
    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
    "针对敏感肌专门设计的天然有机护肤产品",
    "新的化妆趋势注重鲜艳的颜色和创新的技巧",
    "敏感肌のために特別に設計された天然有機スキンケア製品",
    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
]

# construct sentence pairs: one [query, document] pair per candidate document
sentence_pairs = []
for candidate in documents:
    sentence_pairs.append([query, candidate])

# One score per pair, in the same order as `documents`.
scores = reranker_model.compute_score(sentence_pairs, max_length=1024)

In [None]:
import numpy as np

# Print the `top_n` best-scoring documents, highest score first.
top_n = 5
descending_order = np.argsort(scores)[::-1]
top_indices = descending_order[:top_n]

separator = "=" * 30
for idx in top_indices:
    print(f"Score: {scores[idx]:.4f} | Document: {documents[idx]}")
    print(separator)

Score: 0.7931 | Document: Produits pour éliminer les taches brunes sous les yeux
Score: 0.1755 | Document: Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras
Score: 0.1285 | Document: 新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています
Score: 0.0888 | Document: 敏感肌のために特別に設計された天然有機スキンケア製品
Score: 0.0851 | Document: 新的化妆趋势注重鲜艳的颜色和创新的技巧


## Re-Score Qdrant Results

In [None]:
# Query the int8 scalar-quantized collection; the hits are re-ranked in the next cells.
collection_name = "ar_wiki_scalar_int8"
query_text = "متى أصدرت انستجرام تطبيقها المخصص لعملاء نظام تشغيل ميكروسوفت"
query_embed = model.encode(query_text)

# Having the original vectors available, Qdrant can re-evaluate top-k search
# results using the original vectors (rescore=True).
quantization_params = QuantizationSearchParams(
    rescore=True,
    oversampling=2.0,
)
search_params = SearchParams(quantization=quantization_params)

hits = qdrant_client.search(
    collection_name=collection_name,
    query_vector=query_embed,
    limit=5,
    search_params=search_params,
)

In [None]:
# Pull the stored text out of each hit and pair it with the query for the reranker.
hits_texts = []
sentence_pairs = []
for hit in hits:
    passage = hit.payload["text"]
    hits_texts.append(passage)
    # construct sentence pairs
    sentence_pairs.append([query_text, passage])

# One score per pair, in the same order as `hits_texts`.
scores = reranker_model.compute_score(sentence_pairs, max_length=1024)

In [None]:
import numpy as np

# Print the `top_n` best-scoring hit texts, highest reranker score first.
top_n = 5
descending_order = np.argsort(scores)[::-1]
top_indices = descending_order[:top_n]

separator = "=" * 30
for idx in top_indices:
    print(f"Score: {scores[idx]:.4f} | Document: {hits_texts[idx]}")
    print(separator)

Score: 0.6123 | Document: في أبريل 2016، أصدرت إنستغرام تطبيقها إلى ويندوز 10 موبايل، بعد سنوات من الطلب من قبل مايكروسوفت والعامة بأصدار تطبيق إلى المنصة. وفي 21 نوفمبر، 2013، أصدرت إنستغرام لأول مرة تطبيقها ليعمل على ويندوز فون 8، وسبق ذلك إطلاق نسخة بيتا تجريبية لإنستغرام على المنصة. وأضافت فيه الدعم للمقاطع المرئية (مشاهدة وإنشاء المنشورات أو القصص، ومشاهدة البثوث المباشرة)، وكذلك المنشورات والرسائل المباشرة. وفي أكتوبر 2016، تم إصدار تطبيق للحواسيب الشخصية والأجهزة اللوحية التي تعمل على ويندوز 10. وفي مايو 2016، حدثت إنستغرام موقعها الإلكتروني على المتصفح ليسمح للمستخدمين برفع الصور، وإضافة نسخة "lightweight" إلى نافذة الاستكشاف.
Score: 0.1859 | Document: في نوفمبر 2015، أعلنت إنستغرام أنها في 1 يونيو، 2016 ستبدأ بإنهاء وصول بيئة برمجة التطبيقات إلى المنصة وذلك لصيانة التحكم من أجل المجتمع وتوفير خارطة طريق واضحة للمطورين وإنشاء بيئة مبنية أكثر استمرارية تستند على تجارب أصلية غير مزيفة على المنصة، ويتضمن ذلك المحتوى الموجه بإتجاه ناشري المحتوى، والمُعلنين. أفيد بأن تلك التغييرات ك