In [1]:
import json
from tqdm.auto import tqdm
import pandas as pd
import minsearch
from qdrant_client import QdrantClient, models

# Load the FAQ documents (with ids) that the indexes in this notebook are built from.
# The encoding is explicit so the read does not depend on the platform's locale default.
with open('data/documents-with-ids.json', encoding='utf-8') as f:
    docs = json.load(f)

  from .autonotebook import tqdm as notebook_tqdm


## Search evaluation

In [2]:
# Ground-truth set for the search evaluation, loaded from CSV into a DataFrame.
df_ground_truth = pd.read_csv('data/ground-truth-data.csv')

In [3]:
# One dict per ground-truth row; later cells read the 'query', 'course' and
# 'document' keys from each record.
ground_truth = df_ground_truth.to_dict(orient='records')

In [23]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: one entry per query; each entry is a list of booleans
            telling whether the result at that rank is the expected document.

    Returns:
        Share of entries that contain ``True``, or ``0.0`` when
        ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # Nothing was evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). Queries without a relevant result
    contribute 0.

    Args:
        relevance_total: one entry per query; each entry is a list of booleans
            telling whether the result at that rank is the expected document.

    Returns:
        The average reciprocal rank, or ``0.0`` when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # Nothing was evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Stop at the first hit; counting later hits as well would let a
                # single query add more than 1 and push the score above 1.0.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score the results.

    Args:
        ground_truth: iterable of records; each record's ``'document'`` key holds
            the id of the document that should be retrieved.
        search_function: callable taking one record and returning the search
            results. Two result shapes are accepted:

            * an object with a ``.points`` attribute whose items expose
              ``.payload['id']`` (the shape returned by ``qd_client.query_points``);
            * a plain iterable of dicts with an ``'id'`` key (presumably what the
              minsearch helpers return -- verify against the library).

    Returns:
        dict with the ``'hit_rate'`` and ``'mrr'`` of the search function.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        # Unwrap Qdrant-style responses; anything else is treated as the hit list itself.
        hits = getattr(results, 'points', results)
        relevance = [
            (hit['id'] if isinstance(hit, dict) else hit.payload['id']) == doc_id
            for hit in hits
        ]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [8]:
# Text index over the documents: free-text search on question/text/section,
# exact-match filtering on course and id.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"],
)

index.fit(docs)

<minsearch.minsearch.Index at 0x7d62d2f1ea80>

In [9]:
def minsearch_search(query, course):
    """Search the minsearch text index for ``query`` within a single ``course``.

    The ``question`` field is boosted (1.5) and ``section`` is down-weighted (0.1).
    Returns whatever ``index.search`` returns for the top 5 results.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [10]:
# Score the minsearch text index; each ground-truth record supplies the query and course.
evaluate(
    ground_truth=ground_truth,
    search_function=lambda q: minsearch_search(query=q['query'], course=q['course']),
)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:13<00:00, 332.13it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887771}

In [4]:
from minsearch import VectorSearch

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [13]:
# One text per document: the question and the answer text joined by a space.
texts = [doc['question'] + ' ' + doc['text'] for doc in docs]

# TF-IDF features reduced to 128 dimensions with truncated SVD (fixed seed).
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [14]:
# Vector index over the embeddings in `X` (built from `docs` in the same order),
# with `course` registered as a keyword field for filtering.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, docs)

<minsearch.vector.VectorSearch at 0x7d629de1ab70>

In [16]:
def minsearch_vector_search(query, course):
    """Embed ``query`` with the fitted ``pipeline`` and search the vector index.

    Results are restricted to ``course``; returns whatever ``vindex.search``
    returns for the top 5 results.
    """
    return vindex.search(
        query_vector=pipeline.transform([query]),
        filter_dict={'course': course},
        num_results=5,
    )

In [17]:
# Score the TF-IDF/SVD vector index; each ground-truth record supplies the query and course.
evaluate(
    ground_truth=ground_truth,
    search_function=lambda q: minsearch_vector_search(query=q['query'], course=q['course']),
)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 659.25it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

In [4]:
# Client for the Qdrant instance running locally on port 6333.
qd_client = QdrantClient("http://localhost:6333")  # connecting to local Qdrant instance

In [19]:
# Reuse the Qdrant collection when it already exists; otherwise create and populate it.
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "zoomcamp-rag-with-ids-hw"
EMBEDDING_DIMENSIONALITY = 512  # vector size of the collection; presumably the output size of `model_handle`

# Ask the server whether the collection exists instead of wrapping `get_collection`
# in a bare `except:` -- that treated every failure (connection errors, Ctrl-C, ...)
# as "collection missing" and then tried to create it.
if qd_client.collection_exists(collection_name=collection_name):
    print(f"Successfully imported collection: {collection_name}")
else:  # create new collection
    print(f"Collection {collection_name} not found.")

    # Create the collection with specified vector parameters
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )

    points = []
    # `point_id` (not `id`) so the builtin `id` is not shadowed for the rest of the notebook.
    for point_id, doc in enumerate(docs):
        text = doc['question'] + ' ' + doc['text']
        point = models.PointStruct(
            id=point_id,
            # `models.Document` defers embedding of `text` with `model_handle` (FastEmbed)
            vector=models.Document(text=text, model=model_handle),
            payload={
                "text": text,
                "section": doc['section'],
                "course": doc['course'],
                "id": doc['id']
            }  # save all needed metadata fields
        )
        points.append(point)

    qd_client.upsert(
        collection_name=collection_name,
        points=points
    )

Collection zoomcamp-rag-with-ids-hw not found.


Fetching 5 files: 100%|███████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.66s/it]


In [21]:
def search_in_course(query, course, limit=5):
    """Query the Qdrant collection for the points closest to ``query``.

    Args:
        query: query text; it is wrapped in ``models.Document`` so it is embedded
            with ``model_handle``.
        course: course name to restrict the search to. ``None`` searches across
            the three known courses listed below.
        limit: maximum number of points to return.

    Returns:
        The response of ``qd_client.query_points`` (payloads included).
    """
    # `is None` is the identity check; `== None` can be hijacked by custom __eq__.
    if course is None:
        course_list = ['machine-learning-zoomcamp', 'data-engineering-zoomcamp', 'mlops-zoomcamp']
    else:
        course_list = [course]

    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(  # embed the query text with the same model as the documents
            text=query,
            model=model_handle
        ),
        query_filter=models.Filter(  # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchAny(any=course_list)
                )
            ]
        ),
        limit=limit,  # top closest matches
        with_payload=True  # to get metadata in the results
    )

    return results

In [24]:
# Score the Qdrant search; each ground-truth record supplies the query and course.
evaluate(
    ground_truth=ground_truth,
    search_function=lambda q: search_in_course(query=q['query'], course=q['course']),
)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:09<00:00, 66.72it/s]


{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

## RAG evaluation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the saved RAG results CSV into a DataFrame; later cells read its
# `answer_llm`, `answer_orig` and `question` columns.
results_path = "data/results-gpt4o-mini-cosine.csv"
df_results = pd.read_csv(results_path)

In [3]:
# Preview the first rows to check the loaded columns.
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course,cosine
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,0.388594
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,0.29836
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,0.583048
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,-0.019481
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,0.358467


In [6]:
# TF-IDF followed by truncated SVD (128 components, fixed seed) for embedding answers.
# NOTE(review): this rebinds the global `pipeline` that `minsearch_vector_search`
# reads -- calling that search after this cell would use this pipeline instead.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [9]:
# Fit on one string per row: LLM answer, original answer and question joined by
# spaces, so all three text columns contribute to the fitted pipeline.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [10]:
def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        u, v: arrays supporting ``.dot`` (e.g. 1-D numpy arrays) of equal length.

    Returns:
        ``u . v / (|u| * |v|)``, or ``0.0`` when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to nan).
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denom = u_norm * v_norm
    if denom == 0:
        # A zero vector has no direction; report "no similarity" rather than nan,
        # which would otherwise propagate into any mean taken over the scores.
        return 0.0
    return u.dot(v) / denom

In [20]:
# Cosine similarity between the embedded LLM answer and the embedded original
# answer, one value per row of `df_results`.
cosines = []
for _, record in df_results.iterrows():
    llm_vec = pipeline.transform([record['answer_llm']]).reshape(-1)
    orig_vec = pipeline.transform([record['answer_orig']]).reshape(-1)
    cosines.append(cosine(llm_vec, orig_vec))
    

In [30]:
# Mean cosine similarity over all rows.
cosines_np = np.array(cosines)
cosines_np.mean()

np.float64(0.8415841233490402)

In [24]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [25]:
# Inspect a single record (row position 10); the ROUGE example uses this same row.
df_results.iloc[10]

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
cosine                                                   0.68403
Name: 10, dtype: object

In [26]:
from rouge import Rouge
# Scorer instance reused for every ROUGE computation in the following cells.
rouge_scorer = Rouge()

# Score one example row: LLM answer is passed first, original answer second.
r = df_results.iloc[10]
# get_scores returns a list; take its first (only) entry for this single pair.
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [28]:
# The 'f' component of the ROUGE-1 score for the example row.
scores['rouge-1']['f']

0.45454544954545456

In [29]:
# ROUGE-1 'f' score between the LLM answer and the original answer, one per row.
rouges = []
for _, record in df_results.iterrows():
    pair_scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    rouges.append(pair_scores['rouge-1']['f'])

In [31]:
# Mean ROUGE-1 'f' score over all rows.
rouges_np = np.array(rouges)
rouges_np.mean()

np.float64(0.3516946452113943)