In [1]:
!pip install -U minsearch qdrant_client

Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Downloading qdrant_client-1.15.0-py3-none-any.whl (337 kB)
   ---------------------------------------- 0.0/337.3 kB ? eta -:--:--
   - -------------------------------------- 10.2/337.3 kB ? eta -:--:--
   --- ----------------------------------- 30.7/337.3 kB 262.6 kB/s eta 0:00:02
   --- ----------------------------------- 30.7/337.3 kB 262.6 kB/s eta 0:00:02
   ------- ------------------------------- 61.4/337.3 kB 328.2 kB/s eta 0:00:01
   ---------- ---------------------------- 92.2/337.3 kB 403.5 kB/s eta 0:00:01
   ------------- ------------------------ 122.9/337.3 kB 450.6 kB/s eta 0:00:01
   ---------------- --------------------- 143.4/337.3 kB 502.3 kB/s eta 0:00:01
   -------------------- ----------------- 184.3/337.3 kB 530.7 kB/s eta 0:00:01
   ----------------------------- -------- 266.2/337.3 kB 682.7 kB/s eta 0:00:01
   ------------------------------------ - 327.7/337.3 kB 726.4 k


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Q1

In [7]:
import requests
import pandas as pd

# Base URL of the course repository folder that hosts the evaluation data.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents; each record carries the 'id' used for relevance matching below.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth questions; each row names the document it should retrieve.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [8]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query, where each boolean
            says whether the result at that rank is the expected document.

    Returns:
        Number of queries with at least one ``True`` divided by the number
        of queries; ``0.0`` when there are no queries at all.
    """
    # Guard against division by zero on an empty evaluation set.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list of booleans per query, ordered by rank
            (index 0 is the top result).

    Returns:
        Average over all queries of ``1 / rank`` of the first ``True``
        (rank is 1-based); a query with no ``True`` contributes 0.
        Returns ``0.0`` when there are no queries at all.
    """
    # Guard against division by zero on an empty evaluation set.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # list with several hits would add more than one reciprocal
                # rank and the score could exceed 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    Args:
        ground_truth: iterable of records; each record's ``'document'`` value
            is the id of the document the search should return.
        search_function: callable taking one record and returning a list of
            results that each expose an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` of the collected
        relevance lists.
    """
    def _relevance(query):
        # Read the expected id before searching, then flag each hit by rank.
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    relevance_total = [_relevance(query) for query in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Per-field boost weights used when building the TF-IDF text.
boost = dict(question=1.5, section=0.1)

# Boost helper: a field's weight is expressed by repeating its text.
def weighted_text(text, weight):
    """Return ``text`` followed by a space, repeated ``int(weight * 10)`` times.

    A weight of 0.1 therefore yields one copy and 1.5 yields fifteen; the
    result keeps a trailing space, and is empty when the count is zero.
    """
    copies = int(weight * 10)
    unit = text + ' '
    return unit * copies

# One boosted text per document: the question copies followed by the section copies.
corpus = [
    (
        weighted_text(entry['question'], boost['question'])
        + weighted_text(entry['section'], boost['section'])
    ).strip()
    for entry in documents
]
# Document ids in the same order as the corpus rows.
ids = [entry['id'] for entry in documents]

# Fit TF-IDF on the boosted corpus; row i of X belongs to ids[i].
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Define search function
def search_function(q, k=5):
    """Return the ``k`` documents most similar to a ground-truth question.

    Reads the module-level ``vectorizer``, ``X`` and ``ids``. Later cells in
    this notebook rebind ``X``, so this function must be evaluated before
    those cells run.

    Args:
        q: record with a ``'question'`` key.
        k: maximum number of results; values <= 0 yield an empty list.

    Returns:
        List of ``{'id': ..., 'score': ...}`` dicts, best match first.
    """
    # A slice of [-0:] would select every document, so handle k <= 0 explicitly.
    if k <= 0:
        return []

    # The query goes through the same repetition helper as the indexed questions.
    query = weighted_text(q['question'], boost['question']).strip()

    q_vector = vectorizer.transform([query])
    similarities = cosine_similarity(q_vector, X)[0]
    # argsort is ascending: reverse it, then keep the first k indices.
    top_k = similarities.argsort()[::-1][:k]

    return [{'id': ids[i], 'score': similarities[i]} for i in top_k]


In [14]:
# Score the currently defined search_function on every ground-truth question.
results = evaluate(ground_truth, search_function)
print(results)


100%|██████████| 4627/4627 [00:14<00:00, 309.99it/s]

{'hit_rate': 0.6174627188242922, 'mrr': 0.5233232476046396}





Q2

In [44]:
# Import here so this cell also runs on a fresh kernel: it executes before the
# cell that imports VectorSearch, which would otherwise raise NameError.
# NOTE(review): the index built here is rebuilt by a later cell before it is
# queried, so this cell looks redundant — confirm and consider removing it.
from minsearch import VectorSearch

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1e4b1684860>

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from minsearch import VectorSearch

# Step 1: Embed only the question text of each document
texts = [entry['question'] for entry in documents]

# Step 2: Embedding pipeline — TF-IDF followed by truncated SVD (128 components)
pipeline = make_pipeline(TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1))

# Step 3: Fit the pipeline and embed every question in one pass
X = pipeline.fit_transform(texts)

# Step 4: Build the minsearch vector index; row i of X is paired with documents[i]
vindex = VectorSearch(keyword_fields={'course'})  # no keyword filter is passed when searching below
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x1e4992c6a80>

In [46]:
def search_function(q, k=5):
    """Search the vector index with the embedded question of record ``q``.

    The question is embedded with the module-level ``pipeline`` and looked
    up in ``vindex`` without a keyword filter, asking for ``k`` results.
    """
    query_matrix = pipeline.transform([q['question']])
    # transform() returns one row per input text; take the only row.
    query_vector = query_matrix[0]
    return vindex.search(query_vector, filter_dict=None, num_results=k)


In [47]:
# Evaluate the question-only vector search with the same hit-rate / MRR metrics.
results = evaluate(ground_truth, search_function)
print("Vector search (question only):", results)


100%|██████████| 4627/4627 [00:18<00:00, 243.60it/s]


Vector search (question only): {'hit_rate': 0.3939917873352064, 'mrr': 0.2898890569843674}


Q3

In [48]:
# Embed the question together with the answer text this time.
texts = [entry['question'] + ' ' + entry['text'] for entry in documents]


In [49]:


# TF-IDF followed by truncated SVD (128 components), fitted on the current `texts`.
pipeline = make_pipeline(TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1))

# One embedding row per entry of `texts`.
X = pipeline.fit_transform(texts)


In [50]:
# Rebuild the vector index from the current embedding matrix X and the documents.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1e4ac643c20>

In [51]:
def search_function(q, k=5):
    """Embed ``q['question']`` and return up to ``k`` matches from ``vindex``.

    Uses whichever ``pipeline`` and ``vindex`` are bound at module level
    when it is called; no keyword filter is applied.
    """
    # transform() expects a list of texts and returns one row per text.
    embedding = pipeline.transform([q['question']])[0]
    return vindex.search(embedding, filter_dict=None, num_results=k)


In [52]:
# Evaluate the question + text vector search on the ground-truth questions.
results = evaluate(ground_truth, search_function)
print("Q3 - Vector search using question + text:", results)


100%|██████████| 4627/4627 [00:31<00:00, 146.15it/s]

Q3 - Vector search using question + text: {'hit_rate': 0.7704776312945754, 'mrr': 0.6150097255240982}





Q4

In [55]:
pip install sentence-transformers


^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct

# Step 1: Prepare text (question + answer text) and the matching document ids
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]
ids = [doc['id'] for doc in documents]

# Step 2: Load model and embed
# NOTE(review): the model card for this checkpoint loads it with
# trust_remote_code=True — confirm whether that is required here.
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en")
vectors = model.encode(texts)

# Step 3: Setup Qdrant (in-memory)
client = QdrantClient(":memory:")
# recreate_collection() is deprecated in qdrant-client; drop any existing
# collection explicitly and then create it afresh.
if client.collection_exists("faq_docs"):
    client.delete_collection("faq_docs")
client.create_collection(
    collection_name="faq_docs",
    vectors_config=VectorParams(size=len(vectors[0]), distance=Distance.COSINE),
)

# Step 4: Insert into Qdrant
# The point id is the row position; the real document id travels in the payload.
# Vectors are converted to plain lists of floats, as PointStruct expects.
client.upsert(
    collection_name="faq_docs",
    points=[
        PointStruct(id=i, vector=vector.tolist(), payload={"id": ids[i]})
        for i, vector in enumerate(vectors)
    ],
)

# Step 5: Define search function
def search_function(q, k=5):
    """Return up to ``k`` nearest documents from Qdrant for ``q['question']``.

    The question is embedded with the module-level ``model`` and searched in
    the "faq_docs" collection of ``client``.

    Returns:
        List of ``{"id": <document id from the payload>, "score": <score>}``
        dicts in the order Qdrant returns them.
    """
    q_vec = model.encode(q['question'])
    # client.search() is deprecated in qdrant-client; query_points() is its
    # replacement and wraps the hits in a response object (.points).
    response = client.query_points(
        collection_name="faq_docs",
        query=q_vec.tolist(),
        limit=k,
    )
    return [{"id": point.payload["id"], "score": point.score} for point in response.points]

# Step 6: Evaluate the Qdrant-backed search_function on the ground-truth questions
results = evaluate(ground_truth, search_function)
print(results)


Q5

In [39]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Load the RAG evaluation results (the answer_llm / answer_orig / question columns are used below)
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = f'{url_prefix}rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)


In [40]:
def cosine(u, v):
    """Cosine similarity between two 1-D numpy vectors.

    Args:
        u, v: vectors supporting ``.dot`` (e.g. numpy arrays) of equal length.

    Returns:
        ``u·v / (‖u‖ ‖v‖)``. When either vector has zero norm the similarity
        is undefined; ``0.0`` is returned instead of NaN so that a single
        all-zero embedding cannot turn a downstream mean into NaN.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm
    if denominator == 0:
        return 0.0
    return u.dot(v) / denominator


In [41]:
# TF-IDF followed by truncated SVD (128 components); used below to embed both answer columns.
pipeline = make_pipeline(TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1))

# Fit on each row's LLM answer, original answer and question joined by spaces,
# so the vocabulary covers all three text columns.
pipeline.fit(
    df_results['answer_llm'] + ' ' + df_results['answer_orig'] + ' ' + df_results['question']
)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [42]:
# Embed both answer columns with the already-fitted pipeline so they share one vector space.
X_llm = pipeline.transform(df_results['answer_llm'])
X_orig = pipeline.transform(df_results['answer_orig'])


In [43]:
# Row-wise cosine similarity between each LLM answer embedding and its original answer embedding.
cosines = [cosine(llm_vec, orig_vec) for llm_vec, orig_vec in zip(X_llm, X_orig)]
avg_cosine = np.mean(cosines)
print("Average cosine similarity:", avg_cosine)


Average cosine similarity: 0.8415841233490402


Q6

In [35]:
!pip install rouge


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
import pandas as pd
from rouge import Rouge

# Load the GPT-4o-mini results file (the answer_llm / answer_orig columns are scored below)
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = f'{url_prefix}rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)


In [37]:
rouge_scorer = Rouge()

# ROUGE-1 F1 score per row, in the row order of df_results
rouge_1_f1_scores = []

for answer_llm, answer_orig in zip(df_results['answer_llm'], df_results['answer_orig']):
    try:
        scores = rouge_scorer.get_scores(answer_llm, answer_orig)[0]
        rouge_1_f1_scores.append(scores['rouge-1']['f'])
    except Exception:
        # Best-effort: a pair that cannot be scored (e.g. empty or invalid
        # strings) counts as 0.0. Catching Exception instead of a bare
        # except lets KeyboardInterrupt / SystemExit still stop the loop.
        rouge_1_f1_scores.append(0.0)


In [38]:
# Mean ROUGE-1 F1 over all rows (rows that failed scoring count as 0.0), shown to two decimals.
avg_rouge_1_f1 = sum(rouge_1_f1_scores) / len(rouge_1_f1_scores)
print("Average ROUGE-1 F1 Score:", round(avg_rouge_1_f1, 2))


Average ROUGE-1 F1 Score: 0.35
