In [None]:
from qdrant_client import QdrantClient, models

# Address of the Qdrant instance every cell in this notebook talks to.
QDRANT_URL = "http://localhost:6333"
client = QdrantClient(QDRANT_URL)

In [None]:
import os

# Location of the source manual and of the pre-chunked text file derived from it.
data_folder = "data/"
filename = data_folder + "bfp-a3447q.pdf"
# Strip only the final extension. Splitting on the first "." would truncate the
# path as soon as a directory or the file stem contains a dot (e.g. "./data/").
content_path = os.path.splitext(filename)[0] + '_v2_chunked.txt'

In [None]:
import json
import pathlib

# Read the pre-chunked manual from disk and parse it as JSON.
# NOTE(review): no explicit encoding is passed, so the platform default applies —
# confirm the file content is compatible with it.
content_file = pathlib.Path(content_path)
json_read = content_file.read_text()
data_content = json.loads(json_read)

### Step 2: Sparse vector search with BM25

We are going to use the same dataset as before. Let's download it and load it into Qdrant, but this time we are going to create only sparse vectors, using BM25.

We need to create a collection first. Qdrant will handle the IDF calculations if we configure it to do so. This is required for BM25; otherwise, rare words won't be boosted.

In [None]:
# Name of the Qdrant collection used for the BM25-only (sparse) experiment.
collection_name = "bfp-a3447q_sparse"

In [None]:
from qdrant_client import models

# Sparse-only collection: a single named sparse vector, "bm25", with the IDF
# modifier enabled. Creation is skipped when the collection already exists.
bm25_sparse_config = {
    "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF),
}
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        sparse_vectors_config=bm25_sparse_config,
    )

FastEmbed comes with a BM25 implementation that we can use like any other model.

In [None]:
# Build one point per usable chunk of the manual and upload them all to the
# current collection, each with a "bm25" sparse vector.
#
# Each record of data_content is read as:
#    0 - chapter level
#    1 - chapter name
#    2 - page number (1-based)
#   -1 - chunk of text
title = 'RH-3CH-Sxx/RH-6CH-Sxx Special Specifications Manual' # can be obtained from doc metadata
points = []
# Reset on every run: a record that appears before the first level-1 chapter
# then gets None instead of raising NameError or silently reusing a value left
# over from a previous execution of this cell.
root_chapter = None
for index, chapter in enumerate(data_content):
    if chapter[0] == 1:
        root_chapter = chapter[1]
    # Skip records whose text is shorter than three times the chapter name.
    if 3 * len(chapter[1]) > len(chapter[-1]):
        print(f'{index}) Paragraphs not generated for chapter: {chapter[1]}')
        continue
    context = ""  # placeholder: nothing is prepended to the chunk for now
    text = context + chapter[-1]
    point = models.PointStruct(
        # Sequential ids over the kept records only. Using len(points) gives the
        # same numbering as a manual counter without shadowing the builtin id().
        id=len(points),
        vector={
            "bm25": models.Document(
                text=text,
                model="Qdrant/bm25",
            )
        },
        payload={
            "content": chapter[-1],
            "main_chapter": root_chapter,
            "chapter": chapter[1],
            "manual": title,
            "page": chapter[2]
        } #save all needed metadata fields
    )
    points.append(point)
print("Collection points gathered")
client.upsert(
    collection_name=collection_name,
    points=points
)
print(f"Collection {collection_name} upserted.")


In [None]:
def search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Run a BM25 sparse-vector query against the current collection.

    Args:
        query: Free-text query, wrapped in a "Qdrant/bm25" document.
        limit: Maximum number of points to return.

    Returns:
        The points of the query response, with their payloads.
    """
    bm25_query = models.Document(text=query, model="Qdrant/bm25")
    response = client.query_points(
        collection_name=collection_name,  # module-level name, read at call time
        query=bm25_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [None]:
# Look up a single keyword and display up to five hits.
results = search("rcready", limit=5)
results

In [None]:
# Score of the first hit returned by the previous search.
first_hit = results[0]
first_hit.score

### Step 3: Qdrant search for combined vectors

In [None]:
# Sentence-transformers model for the dense embeddings; the same string is
# also used as the name of the dense vector in the hybrid collection.
model_name = "all-mpnet-base-v2"

In [None]:
from sentence_transformers import SentenceTransformer

# Load the dense embedding model.
# trust_remote_code is left at its default (False) so that no code shipped in
# the model repository is executed locally.
# NOTE(review): all-mpnet-base-v2 should load without it — confirm.
model = SentenceTransformer(
    model_name,
    cache_folder="./models",   # explicitly setting cache location
)
# Vector size, needed when declaring the dense vector of the collection.
emb_dimensions = model.get_sentence_embedding_dimension()

In [None]:
# From here on the notebook works with the hybrid (dense + BM25) collection.
# This rebinds the module-level name that the search helpers read at call time.
collection_name = "bfp-a3447q_hybrid"

In [None]:
# Hybrid collection: one dense vector named after the embedding model plus the
# "bm25" sparse vector with the IDF modifier. Nothing is done (and nothing is
# printed) when the collection already exists.
if not client.collection_exists(collection_name):
    dense_config = {
        model_name: models.VectorParams(
            size=emb_dimensions,              # dimensionality reported by the model
            distance=models.Distance.COSINE,  # distance metric for similarity search
        ),
    }
    sparse_config = {
        "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF),
    }
    client.create_collection(
        collection_name=collection_name,
        vectors_config=dense_config,
        sparse_vectors_config=sparse_config,
    )
    print(f"New collection {collection_name} created.")

We have to upload all the vectors into the newly created collection.

In [None]:
# Build one point per usable chunk of the manual, carrying both a dense
# embedding and a "bm25" sparse vector, and upload them all to the current
# collection.
#
# Each record of data_content is read as:
#    0 - chapter level
#    1 - chapter name
#    2 - page number (1-based)
#   -1 - chunk of text
title = 'RH-3CH-Sxx/RH-6CH-Sxx Special Specifications Manual' # can be obtained from doc metadata
points = []
# Reset on every run: a record that appears before the first level-1 chapter
# then gets None instead of raising NameError or silently reusing a value left
# over from a previous cell or execution.
root_chapter = None
for index, chapter in enumerate(data_content):
    if chapter[0] == 1:
        root_chapter = chapter[1]
    # Skip records whose text is shorter than three times the chapter name.
    if 3 * len(chapter[1]) > len(chapter[-1]):
        print(f'{index}) Paragraphs not generated for chapter: {chapter[1]}')
        continue

    context = ""  # placeholder: nothing is prepended to the chunk for now
    text = context + chapter[-1]

    point = models.PointStruct(
        # Sequential ids over the kept records only. Using len(points) gives the
        # same numbering as a manual counter without shadowing the builtin id().
        id=len(points),
        vector={
            # Dense embedding computed locally, stored under the model's name.
            model_name: model.encode(text).tolist(),
            "bm25": models.Document(
                text=text,
                model="Qdrant/bm25",
            ),
        },
        payload={
            "content": chapter[-1],
            "main_chapter": root_chapter,
            "chapter": chapter[1],
            "manual": title,
            "page": chapter[2]
        } #save all needed metadata fields
    )
    points.append(point)

print("Collection points gathered")
client.upsert(
    collection_name=collection_name,
    points=points
)
print(f"Collection {collection_name} upserted.")

### Step 4: Qdrant Universal Query API - prefetching

In [None]:
def multi_stage_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Query the current collection with a dense prefetch and a BM25 main query.

    The prefetch asks for 5 * limit candidates using the dense vector named
    after the embedding model; the main query is a "Qdrant/bm25" document
    query on the "bm25" vector, capped at `limit`.

    Args:
        query: Free-text query, used for both stages.
        limit: Maximum number of points to return.

    Returns:
        The points of the query response, with their payloads.
    """
    dense_prefetch = models.Prefetch(
        query=model.encode(query).tolist(),
        using=model_name,
        limit=(5 * limit),
    )
    bm25_query = models.Document(text=query, model="Qdrant/bm25")
    response = client.query_points(
        collection_name=collection_name,  # module-level name, read at call time
        prefetch=[dense_prefetch],
        query=bm25_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [None]:
# Question used to try out the multi-stage search.
q = "How much of space is required to place controller?"

In [None]:
# Actually run the multi-stage search for `q`. With the call commented out,
# this cell printed whatever `results` was left over from an earlier search.
results = multi_stage_search(q, 5)
print(*results, sep="\n\n")

In [None]:
def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Query the current collection with dense and BM25 prefetches fused by RRF.

    Two prefetches are issued, each asking for 5 * limit candidates: one on
    the dense vector named after the embedding model, one on the "bm25"
    sparse vector. The main query is a fusion query using Fusion.RRF.

    Args:
        query: Free-text query, used for both prefetches.
        limit: Maximum number of points to return.

    Returns:
        The points of the query response, with their payloads.
    """
    candidates_per_branch = 5 * limit
    dense_branch = models.Prefetch(
        query=model.encode(query).tolist(),
        using=model_name,
        limit=candidates_per_branch,
    )
    bm25_branch = models.Prefetch(
        query=models.Document(text=query, model="Qdrant/bm25"),
        using="bm25",
        limit=candidates_per_branch,
    )
    response = client.query_points(
        collection_name=collection_name,  # module-level name, read at call time
        prefetch=[dense_branch, bm25_branch],
        # The fusion query combines the results of the prefetches.
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        with_payload=True,
    )
    return response.points

In [None]:
# Question used to try out the RRF search; rebinds the `q` set in an earlier cell.
q = "which signal confirms that servo is working?"

In [None]:
# Run the RRF search for `q` and print every returned point, separated by blank lines.
results = rrf_search(q, limit=5)
print(len(results), "results:")
print(*results, sep="\n\n")