In [1]:
pip install -U minsearch qdrant_client rouge pandas scikit-learn tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [None]:
#Question 1. Hitrate for minsearch text (1 point)

In [58]:
from minsearch import Index

index = Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["id", "course"]
)
index.fit(documents)

def search_function(q):
    boost = {'question': 1.5, 'section': 0.1}
    
    return index.search(query=q['question'], 
                        filter_dict={"course": q['course']}, 
                        boost_dict=boost, 
                        num_results=5)

evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [None]:
#Question 2. MRR Vector search (question field) (1 point)

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from minsearch import VectorSearch

In [60]:
texts = [doc['question'] for doc in documents]

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

X = pipeline.fit_transform(texts)

In [61]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7b26d2b9d0d0>

In [62]:
def search_function(q):
    q_vec = pipeline.transform([q['question']])
    return vindex.search(q_vec[0], 
                         filter_dict={"course": q['course']},
                         num_results=5)

In [63]:
evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

In [None]:
#Question 3. Hitrate Vector search (question + text fields) (1 point)

In [64]:
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [66]:
from minsearch import VectorSearch

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7b26e88eda60>

In [67]:
def search_function(q):
    q_vec = pipeline.transform([q['question']])
    return vindex.search(q_vec[0], 
                         filter_dict={"course": q['course']},
                         num_results=5)

In [68]:
evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

In [None]:
#Question 4. MRR Qdrant (1 point)

In [15]:
pip install qdrant-client sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [16]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

In [17]:
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en")

texts = [doc["question"] + " " + doc["text"] for doc in documents]
vectors = [model.encode(text).tolist() for text in texts]

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

In [18]:
qdrant = QdrantClient(":memory:")

qdrant.recreate_collection(
    collection_name="faq",
    vectors_config=VectorParams(size=len(vectors[0]), distance=Distance.COSINE)
)

points = [
    PointStruct(id=i, vector=vectors[i], payload=documents[i])
    for i in range(len(documents))
]

qdrant.upsert(collection_name="faq", points=points)

  qdrant.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
def search_function(q):
    doc_id = q['document']
    doc = next(doc for doc in documents if doc['id'] == doc_id)
    query = q['question'] + ' ' + doc['text']
    query_vector = model.encode(query).tolist()
    hits = qdrant.search(collection_name="faq", query_vector=query_vector, limit=5)
    return [hit.payload for hit in hits]


In [22]:
evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

  hits = qdrant.search(collection_name="faq", query_vector=query_vector, limit=5)


{'hit_rate': 0.9920034579641236, 'mrr': 0.9887652186441899}

In [None]:
#Question 5. Average cosine (1 point)

In [33]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

In [34]:
texts = [doc["question"] + " " + doc["text"] for doc in documents]
vectors = [model.encode(text).tolist() for text in texts]

In [35]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

qdrant = QdrantClient(":memory:")

if qdrant.collection_exists("faq"):
    qdrant.delete_collection("faq")

qdrant.create_collection(
    collection_name="faq",
    vectors_config=VectorParams(size=len(vectors[0]), distance=Distance.COSINE)
)

points = [
    PointStruct(id=i, vector=vectors[i], payload=documents[i])
    for i in range(len(documents))
]

qdrant.upsert(collection_name="faq", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [45]:
def search_function(q):
    doc_id = q['document']
    doc = next(doc for doc in documents if doc['id'] == doc_id)

    query = q['question'] + ' ' + doc['text']
    query_vector = model.encode(query).tolist()

    hits = qdrant.search(
        collection_name="faq",
        query_vector=query_vector,
        limit=5
    )
    return [hit.payload for hit in hits]

In [46]:
evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

  hits = qdrant.search(


{'hit_rate': 0.9952452993300195, 'mrr': 0.988152870830632}

In [None]:
#Question 6. Average Rouge-1 F1 (1 point)

In [51]:
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [49]:
import numpy as np

def cosine(u, v):
    u_norm = np.linalg.norm(u)
    v_norm = np.linalg.norm(v)
    return u.dot(v) / (u_norm * v_norm)

In [50]:
similarities = []

for _, row in df_results.iterrows():
    v_llm = pipeline.transform([row.answer_llm])[0]
    v_orig = pipeline.transform([row.answer_orig])[0]
    sim = cosine(v_llm, v_orig)
    similarities.append(sim)

avg_cosine = np.mean(similarities)
print("Average cosine similarity:", avg_cosine)

Average cosine similarity: 0.8415841233490402


In [52]:
pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [53]:
from rouge import Rouge
rouge = Rouge()

In [54]:
f1_scores = []

for _, row in df_results.iterrows():
    scores = rouge.get_scores(row.answer_llm, row.answer_orig)[0]
    f1 = scores['rouge-1']['f']
    f1_scores.append(f1)

avg_f1 = np.mean(f1_scores)
print("Average ROUGE-1 F1:", avg_f1)

Average ROUGE-1 F1: 0.3516946452113943
