# Homework: Search Evaluation

We will use minsearch and Qdrant. Make sure you have the most up-to-date versions.

> minsearch should be at least 0.0.4

In [115]:
from minsearch import Index
from minsearch import VectorSearch
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from tqdm.auto import tqdm

In [140]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [141]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

In [142]:
df_ground_truth[:3]

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef


In [143]:
ground_truth[:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

In [144]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

> text_fields=['question', 'text', 'section']
These are the fields that minsearch.Index will use to build its TF-IDF representations.

> keyword_fields=['course']
Documents&ground_truth have a course field

In [145]:
index = Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)

# Fit the index with your documents
index.fit(documents)

<minsearch.minsearch.Index at 0x12833b710>

In [146]:
def minsearch_text_search(q):
    boost = {'question': 1.5, 'section': 0.1}

    _query = q['question']
    _course = q['course']

    results = index.search(
        query=_query,
        filter_dict={'course': _course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [147]:
metrics_q1 = evaluate(ground_truth, minsearch_text_search)

100%|██████████| 4627/4627 [00:04<00:00, 969.27it/s]


In [148]:
print("Q1 Minsearch Text Search Evaluation Metrics:")
print(f"Hit Rate: {metrics_q1['hit_rate']:.4f}")
print(f"MRR: {metrics_q1['mrr']:.4f}")

Q1 Minsearch Text Search Evaluation Metrics:
Hit Rate: 0.8487
MRR: 0.7288


## Q2. Vector search for question

Instead of using minsearch.Index, we'll be using minsearch.VectorSearch and the pipeline to get embeddings.

In [162]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128)
)
X = pipeline.fit_transform(texts)

Index embeddings with minsearch.VectorSearch

In [163]:
vindex = VectorSearch(keyword_fields=['course'])
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x128336f10>

In [164]:
def minsearch_vsearch(q):
    _query = q['question']
    _course = q['course']

    query_embedding = pipeline.transform([_query])[0]

    results = vindex.search(
        query_vector=query_embedding,
        filter_dict={'course': _course},
        num_results=5,
    )
    return results

In [165]:
metrics_q2 = evaluate(ground_truth, minsearch_vsearch)

100%|██████████| 4627/4627 [00:02<00:00, 1996.54it/s]


In [166]:
print("Q2 Minsearch Vector Search Evaluation Metrics:")
print(f"Hit Rate: {metrics_q2['hit_rate']:.4f}")
print(f"MRR: {metrics_q2['mrr']:.4f}")

Q2 Minsearch Vector Search Evaluation Metrics:
Hit Rate: 0.4815
MRR: 0.3578


## Q3. Vector search for question and answer

Create combined text for embeddings: "question" + " " + "text"

In [167]:
texts_combined = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts_combined.append(t)

pipeline_q3 = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128)
)
X_combined = pipeline_q3.fit_transform(texts_combined)

vindex_q3 = VectorSearch(keyword_fields=['course'])
vindex_q3.fit(X_combined, documents)

<minsearch.vector.VectorSearch at 0x11fdb27d0>

In [168]:
def minsearch_vector_search_q3(q):
    _query = q['question']
    _course = q['course']

    query_embedding_q3 = pipeline_q3.transform([_query])[0]

    search_results = vindex_q3.search(
        query_vector=query_embedding_q3,
        filter_dict={'course': _course},
        num_results=5,
    )
    return search_results

In [170]:
metrics_q3 = evaluate(ground_truth, minsearch_vector_search_q3)

100%|██████████| 4627/4627 [00:03<00:00, 1485.64it/s]


In [171]:
print("Q3 Minsearch Vector Search (Question + Text) Evaluation Metrics:")
print(f"Hit Rate: {metrics_q3['hit_rate']:.4f}")
print(f"MRR: {metrics_q3['mrr']:.4f}")

Q3 Minsearch Vector Search (Question + Text) Evaluation Metrics:
Hit Rate: 0.8223
MRR: 0.6729
