In [1]:
import json
from tqdm.auto import tqdm

with open('data/documents-with-ids.json') as f:
    docs = json.loads(f.read())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from qdrant_client import QdrantClient, models
qd_client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [16]:
# Define the collection name
collection_name = "zoomcamp-rag-with-ids"
EMBEDDING_DIMENSIONALITY = 512

# Create the collection with specified vector parameters
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [19]:
points = []
id = 0

for doc in docs:

    point = models.PointStruct(
        id=id,
        vector=models.Document(text=doc['text'], model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id']
        } #save all needed metadata fields
    )
    points.append(point)

    id += 1

In [20]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
qd_client.get_collection(collection_name= collection_name)

In [9]:
model_handle = "jinaai/jina-embeddings-v2-small-en"
def search_in_course(query, course, limit=3):
    
    if course == None:
        course_list = ['machine-learning-zoomcamp', 'data-engineering-zoomcamp', 'mlops-zoomcamp']
    else:
        course_list = [course]
        
    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle
        ),
        query_filter=models.Filter( # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchAny(any=course_list)
                )
            ]
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

In [2]:
import pandas as pd

In [3]:
df_ground_truth = pd.read_csv('data/ground-truth-data.csv')

In [4]:
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth

[{'query': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'query': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'query': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'query': 'How can I receive course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'query': 'Where do I join the Slack channel?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'query': 'Where can I find the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'query': 'How do I check the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'query': 'Where are the course prerequisites listed?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'query': 'What ar

In [35]:
relevance_table = []
for query in tqdm(ground_truth):
    doc_id = query['document']
    results = search_in_course(query['query'], None, limit=5)
    relevance = [d.payload['id'] == doc_id for d in results.points]
    relevance_table.append(relevance)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:11<00:00, 64.31it/s]


In [36]:
relevance_table[:10]

[[False, False, False, False, False],
 [False, True, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False]]

In [37]:
example = [
    [False, False, False, False, False],# 0
    [False, True, False, False, False], # 1
    [False, False, True, False, False], # 1
    [False, False, False, False, False],# 0
    [False, False, False, True, False], # 1
    [False, False, False, False, False],# 0
    [False, False, False, False, False],# 0
    [False, False, False, False, False],# 0
    [False, False, False, False, False],# 0
    [False, False, False, False, False] # 0
]

Evaluation metrics:
- hit-rate (recall)
- mrr 

In [5]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [6]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [40]:
hit_rate(relevance_table)

0.6695483034363519

In [41]:
mrr(relevance_table)

0.5689251494849072

In [7]:
import minsearch

In [8]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(docs)

<minsearch.minsearch.Index at 0x7cee4a6a3230>

In [9]:
def minsearch_search(query, course):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [23]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['query'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 326.28it/s]


In [25]:
(hit_rate(relevance_total), mrr(relevance_total))

(0.7722066133563864, 0.6614545061594991)