# Homework: Search Evaluation

We will use minsearch and Qdrant. Make sure you have the most up-to-date versions.

> minsearch should be at least 0.0.4

In [205]:
import uuid

from minsearch import Index
from minsearch import VectorSearch
import numpy as np
import pandas as pd
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding
import requests
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from tqdm.auto import tqdm

In [140]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [141]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

In [142]:
df_ground_truth[:3]

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef


In [143]:
ground_truth[:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

In [144]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

> text_fields=['question', 'text', 'section']
These are the fields that minsearch.Index will use to build its TF-IDF representations.

> keyword_fields=['course']
Documents&ground_truth have a course field

In [145]:
index = Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)

# Fit the index with your documents
index.fit(documents)

<minsearch.minsearch.Index at 0x12833b710>

In [146]:
def minsearch_text_search(q):
    boost = {'question': 1.5, 'section': 0.1}

    _query = q['question']
    _course = q['course']

    results = index.search(
        query=_query,
        filter_dict={'course': _course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [147]:
metrics_q1 = evaluate(ground_truth, minsearch_text_search)

100%|██████████| 4627/4627 [00:04<00:00, 969.27it/s]


In [148]:
print("Q1 Minsearch Text Search Evaluation Metrics:")
print(f"Hit Rate: {metrics_q1['hit_rate']:.4f}")
print(f"MRR: {metrics_q1['mrr']:.4f}")

Q1 Minsearch Text Search Evaluation Metrics:
Hit Rate: 0.8487
MRR: 0.7288


## Q2. Vector search for question

Instead of using minsearch.Index, we'll be using minsearch.VectorSearch and the pipeline to get embeddings.

In [162]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128)
)
X = pipeline.fit_transform(texts)

Index embeddings with minsearch.VectorSearch

In [163]:
vindex = VectorSearch(keyword_fields=['course'])
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x128336f10>

In [164]:
def minsearch_vsearch(q):
    _query = q['question']
    _course = q['course']

    query_embedding = pipeline.transform([_query])[0]

    results = vindex.search(
        query_vector=query_embedding,
        filter_dict={'course': _course},
        num_results=5,
    )
    return results

In [165]:
metrics_q2 = evaluate(ground_truth, minsearch_vsearch)

100%|██████████| 4627/4627 [00:02<00:00, 1996.54it/s]


In [166]:
print("Q2 Minsearch Vector Search Evaluation Metrics:")
print(f"Hit Rate: {metrics_q2['hit_rate']:.4f}")
print(f"MRR: {metrics_q2['mrr']:.4f}")

Q2 Minsearch Vector Search Evaluation Metrics:
Hit Rate: 0.4815
MRR: 0.3578


## Q3. Vector search for question and answer

Create combined text for embeddings: "question" + " " + "text"

In [167]:
texts_combined = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts_combined.append(t)

pipeline_q3 = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128)
)
X_combined = pipeline_q3.fit_transform(texts_combined)

vindex_q3 = VectorSearch(keyword_fields=['course'])
vindex_q3.fit(X_combined, documents)

<minsearch.vector.VectorSearch at 0x11fdb27d0>

In [168]:
def minsearch_vector_search_q3(q):
    _query = q['question']
    _course = q['course']

    query_embedding_q3 = pipeline_q3.transform([_query])[0]

    search_results = vindex_q3.search(
        query_vector=query_embedding_q3,
        filter_dict={'course': _course},
        num_results=5,
    )
    return search_results

In [193]:
metrics_q3 = evaluate(ground_truth, minsearch_vector_search_q3)

100%|██████████| 4627/4627 [00:04<00:00, 1100.46it/s]


In [171]:
print("Q3 Minsearch Vector Search (Question + Text) Evaluation Metrics:")
print(f"Hit Rate: {metrics_q3['hit_rate']:.4f}")
print(f"MRR: {metrics_q3['mrr']:.4f}")

Q3 Minsearch Vector Search (Question + Text) Evaluation Metrics:
Hit Rate: 0.8223
MRR: 0.6729


# Q4. Qdrant Evaluation



In [176]:
EMBEDDING_DIMENSIONALITY = 512
collection_name = 'zoomcamp_documents_eval'
model = "jinaai/jina-embeddings-v2-small-en"
embedding_model = TextEmbedding(model_name=model)
limit = 5

client = QdrantClient("http://localhost:6333")

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [183]:
embedding_dimension = embedding_model.get_embedding_size(model)
print(f"Embedding dimension: {embedding_dimension}")

Embedding dimension: 512


In [188]:
points = []
texts_to_embed = []
doc_mapping = []

for doc in documents:
    doc_text_for_embedding = doc.get('question', '') + ' ' + doc.get('text', '')
    texts_to_embed.append(doc_text_for_embedding)
    doc_mapping.append(doc)

document_embeddings = list(embedding_model.embed(texts_to_embed))

for i, embedding in enumerate(tqdm(document_embeddings, desc="Preparing points")):
    doc = doc_mapping[i]
    payload = {k: v for k, v in doc.items()}

    qdrant_point_id = str(uuid.uuid4())
    # Store the original document ID in the payload for later retrieval/matching
    # It's already there because payload = {k: v for k, v in doc.items()} includes 'id'
    # but explicitly ensure it's correct for clarity.
    # original_doc_id = doc['id'] # This is the 'c02e79ef' type ID
    # payload['original_id'] = original_doc_id # You could rename it if 'id' in payload is confusing

    points.append(
        models.PointStruct(
            id=qdrant_point_id,
            vector=embedding.tolist(),
            payload=payload
        )
    )

Preparing points: 100%|██████████| 948/948 [00:00<00:00, 19748.78it/s]


In [189]:
client.upsert(
    collection_name=collection_name,
    wait=True,
    points=points
)
print(f"Uploaded {len(points)} documents to Qdrant.")

Uploaded 948 documents to Qdrant.


In [190]:
def qdrant_search_q4_fastembed(q):
    query_text = q['question']
    target_course = q['course']

    query_embedding = list(embedding_model.embed([query_text]))[0].tolist()

    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=target_course)
            )
        ]
    )

    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        query_filter=course_filter,
        limit=limit,
        with_payload=True # We retrieve the payload which contains the original 'id'
    )

    results = []
    for hit in search_result:
        # The .payload should contain the original 'id' field of the document
        results.append(hit.payload)

    return results

In [192]:
metrics_q4 = evaluate(ground_truth, qdrant_search_q4_fastembed)

  search_result = client.search(
100%|██████████| 4627/4627 [00:29<00:00, 159.44it/s]


In [195]:
print("Q4 Qdrant Vector Search (Question + Text) Evaluation Metrics:")
print(f"Hit Rate: {metrics_q4['hit_rate']:.4f}")
print(f"MRR: {metrics_q4['mrr']:.4f}")

Q4 Qdrant Vector Search (Question + Text) Evaluation Metrics:
Hit Rate: 0.9300
MRR: 0.8518


## Q5. Cosine simiarity

In [198]:
def normalize(u):
    norm = np.sqrt(u.dot(u))
    return u / norm

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [199]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [200]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [202]:
cosine_similarities = []

for index, row in df_results.iterrows():
    answer_llm = row['answer_llm']
    answer_orig = row['answer_orig']

    # Transform these texts into embeddings using the fitted pipeline
    # The transform method expects an iterable, so pass as [text].
    # It returns a 2D array, so take [0] to get the 1D embedding vector.
    v_llm = pipeline.transform([answer_llm])[0]
    v_orig = pipeline.transform([answer_orig])[0]

    # Compute the cosine similarity between the two embeddings
    similarity = cosine(v_llm, v_orig)
    cosine_similarities.append(similarity)

In [203]:
average_cosine_similarity = np.mean(cosine_similarities)

print(f"\nAverage Cosine Similarity: {average_cosine_similarity:.4f}")


Average Cosine Similarity: 0.8416


## Q6. Rouge
> rouge==1.0.1 works with minsearch==0.0.2

In [210]:
r_10 = df_results.iloc[10]
r_10

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [211]:
rouge_scorer = Rouge()

scores_10 = rouge_scorer.get_scores(r_10.answer_llm, r_10.answer_orig)[0]
print(f"ROUGE scores for document at index 10:\n{scores_10}")
print(f"ROUGE-1 F1 score for document at index 10: {scores_10['rouge-1']['f']:.2f}")

ROUGE scores for document at index 10:
{'rouge-1': {'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}, 'rouge-2': {'r': 0.21621621621621623, 'p': 0.21621621621621623, 'f': 0.21621621121621637}, 'rouge-l': {'r': 0.3939393939393939, 'p': 0.3939393939393939, 'f': 0.393939388939394}}
ROUGE-1 F1 score for document at index 10: 0.45


Compute ROUGE-1 F1 for all pairs in the dataframe

In [212]:
rouge1_f1_scores = []

print("\nCalculating ROUGE-1 F1 scores for all pairs...")
for index, row in tqdm(df_results.iterrows(), total=len(df_results), desc="Processing rows"):
    try:
        scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
        rouge1_f1 = scores['rouge-1']['f']
        rouge1_f1_scores.append(rouge1_f1)
    except ValueError as e:
        rouge1_f1_scores.append(0.0)




Calculating ROUGE-1 F1 scores for all pairs...


Processing rows: 100%|██████████| 1830/1830 [00:02<00:00, 681.01it/s]


In [213]:
average_rouge1_f1 = np.mean(rouge1_f1_scores)

print(f"\nAverage ROUGE-1 F1 Score: {average_rouge1_f1:.4f}")


Average ROUGE-1 F1 Score: 0.3517
