In [1]:
pip install -U minsearch qdrant_client

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Collecting grpcio>=1.41.0 (from qdrant_client)
  Downloading grpcio-1.73.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting protobuf>=3.20.0 (from qdrant_client)
  Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant_client)
  Downloading h2-4.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting hyperframe<7,>=6.1 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant_client)
  Downloading hyperframe-6.1.0-py3-none-any.whl.metadata (4.3 kB)
Collecting hpack<5,>=4.1 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant_client)
  Downloading hpack-4.1.0-py3-none-any.whl.metadata (4.6 kB)
D

In [31]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [32]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [33]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7cd3127cd730>

In [34]:
boost_params = {'question': 1.5, 'section': 0.1}

In [35]:
def minsearch_search(query, course, boost_params):
    boost = {'question': boost_params['question'], 'section': boost_params['section']}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [36]:
#example of question
ground_truth[0]['question']

'When does the course begin?'

In [37]:
minsearch_search(ground_truth[0]['question'], ground_truth[0]['course'], boost_params)

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'quest

In [38]:
#Q1
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course'], boost_params))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:12<00:00, 362.00it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [39]:
#Q2 Begin
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [40]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [189]:
X

array([[ 0.20189188, -0.19028114, -0.10261914, ...,  0.03719206,
         0.02850986, -0.04641277],
       [ 0.2723704 , -0.33653397, -0.1445361 , ..., -0.0499137 ,
         0.01132394,  0.02318573],
       [ 0.25137243, -0.24366293, -0.11105337, ...,  0.0322307 ,
        -0.02414921, -0.02599206],
       ...,
       [ 0.21850466,  0.2859507 ,  0.13110213, ...,  0.03990522,
        -0.02636175,  0.0350963 ],
       [ 0.01265053,  0.01110092, -0.02217507, ..., -0.02871288,
        -0.01579063, -0.08238173],
       [ 0.19543413, -0.03891868,  0.2853495 , ...,  0.11603444,
         0.03531262, -0.04113139]], shape=(948, 128))

In [41]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7cd312ff67e0>

In [42]:
def vector_search(document, boost_params):
    boost = {'question': boost_params['question'], 'section': boost_params['section']}

    question = document['question']
    course = document['course']
    vec = pipeline.transform([question])
    
    results = vindex.search(
        vec[0],
        filter_dict={'course': course},
        num_results=5
    )

    return results

In [43]:
#example of question
ground_truth[0]['question']

'When does the course begin?'

In [44]:
vector_search(ground_truth[0], boost_params)

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'cours

In [194]:
#Q2 End
evaluate(ground_truth, lambda q: vector_search(q, boost_params))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:06<00:00, 696.62it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

In [16]:
#Q3 Begin
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X3 = pipeline.fit_transform(texts)

In [21]:
vindex3 = VectorSearch(keyword_fields={'course'})
vindex3.fit(X3, documents)

<minsearch.vector.VectorSearch at 0x72fe7748bcb0>

In [22]:
def vector_search3(document, boost_params):
    boost = {'question': boost_params['question'], 'section': boost_params['section']}

    question = document['question']
    course = document['course']
    vec = pipeline.transform([question])
    
    results = vindex3.search(
        vec[0],
        filter_dict={'course': course},
        num_results=5
    )

    return results

In [23]:
#Q3 End
evaluate(ground_truth, lambda q: vector_search3(q, boost_params))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 621.94it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

In [119]:
#Q4 Begin
# Run a Qdrant instance in a Docker container before we start this 

In [120]:
!python -m pip install -q "qdrant-client[fastembed]>=1.14.2"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
qd_client = QdrantClient("http://localhost:6333")

In [225]:
collection_name = "homework-03-Q4"

qd_client.delete_collection(collection_name=collection_name)

qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,
        distance=models.Distance.COSINE
    )
)

True

In [226]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [227]:
points = []
id = 0

for i, doc in enumerate(documents):

    point = models.PointStruct(
        id=id,
        vector=models.Document(text = doc['question'] + ' ' + doc['text'], model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id']
        } #save all needed metadata fields
    )
    points.append(point)

    id += 1

In [228]:
# takes some time
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [240]:
question = ground_truth[2]['question']
question

'What is the link for course registration?'

In [237]:
def quadrant_search1(document, limit=5):
    
    question = document['question']
    course = document['course']
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        limit=5,
        with_payload=True 
    )
    #return query_points
    results = [point.payload for point in query_points.points]
    return results

In [241]:
quadrant_result = quadrant_search1(ground_truth[2])
quadrant_result

[{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'section': 'General course-related questions',
  'course': 'data-engineering-zoomcamp',
  'id': '0bbf41ec'},
 {'text': 'Problem description\nProject structure:\n/sources/production/model_service.py\n/sources/tests/unit_tests/test_model_service.py (“from production.model_service import ModelService)\nWhen running python test_model_service.py from the sources directory, it works.\nWhen running pytest ./test/unit_tests fails. ‘No module named ‘production’’\nSolution description\nUse python -m pytest ./test/unit_tests\nExplanation: pytest does not add to the sys.path the path where pytest is run.\nYou can run python -m pytest, or alternatively export PYTHONPATH=. Before executing pytest\nAdded by MarcosMJD',
  'section': 'Module 6: Best practices',
  'cour

In [242]:
#Q4 End
evaluate(ground_truth, lambda q: quadrant_search1(q))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:55<00:00, 83.72it/s]


{'hit_rate': 0.03177004538577912, 'mrr': 0.012538721994092655}

In [15]:
#Q5 Begin
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [22]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp


In [16]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)


In [17]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [19]:
texts = df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question
pipeline.fit(texts)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [23]:
v_llm = pipeline.transform(df_results['answer_llm'])
v_orig = pipeline.transform(df_results['answer_orig'])


In [29]:
import numpy as np
cosines = []
for u, v in zip(v_llm, v_orig):
    cosines.append(cosine(u, v))

In [30]:
#Q5 End
average_cosine = np.mean(cosines)
print("Average cosine similarity:", average_cosine)

Average cosine similarity: 0.8415841233490402


In [46]:
#Q6 Begin
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [47]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [63]:
rouge_1_f1_scores = []

for i in range(len(df_results)):
    r = df_results.iloc[i]
    scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
    rouge_1_f1 = scores['rouge-1']['f']
    rouge_1_f1_scores.append(rouge_1_f1)

In [64]:
#Q6 End
average_rouge_1_f1 = np.mean(rouge_1_f1_scores)
print("Average ROUGE-1 F1:", average_rouge_1_f1)

Average ROUGE-1 F1: 0.3516946452113943
