## Question 1

In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [4]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x13cd07550>

In [5]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [6]:
text_fields=["question", "section", "text"],
keyword_fields=["course", "id"]

In [7]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [8]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [9]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

## Question 2

In [11]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x16c332cd0>

In [12]:
def search_function(q, k=5):
    x = pipeline.transform([q['question']])
    results = vindex.search(x)
    return results[:k]

In [13]:
evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.3939917873352064, 'mrr': 0.29021324112095703}

## Question 3

In [15]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [16]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7704776312945754, 'mrr': 0.6157301347165195}

## Question 4

In [18]:
from qdrant_client import QdrantClient, models  
import uuid  

client = QdrantClient(":memory:")

model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5

In [19]:
client.create_collection(
    collection_name="homework_collection",
    vectors_config={
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    }
)

True

In [20]:
points = []
for doc in documents:
    text = doc['question'] + ' ' + doc['text']  # As specified in Q4
    
    points.append(
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "jina-small": models.Document(
                    text=text,
                    model=model_handle,
                ),
            },
            payload=doc
        )
    )

In [21]:
client.upsert(
    collection_name="homework_collection",
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
def search_function(q, limit=limit):
    results = client.query_points(
        collection_name="homework_collection",
        query=models.Document(
            text=q['question'],
            model=model_handle,
        ),
        using="jina-small",
        limit=limit,
        with_payload=True,
    )
    
    return [result.payload for result in results.points]

In [23]:
evaluate(ground_truth, search_function)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9120380376053598, 'mrr': 0.8244542900367421}

## Question 5

In [25]:
import numpy as np

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [26]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [27]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [28]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [29]:
v_llm = pipeline.transform(df_results.answer_llm)
v_orig = pipeline.transform(df_results.answer_orig)

In [30]:
cosine_similarities = []
for i in range(len(df_results)):
    u = v_llm[i]
    v = v_orig[i]
    
    cos_sim = cosine(u, v)
    cosine_similarities.append(cos_sim)

In [31]:
np.mean(cosine_similarities)

0.8415841233490402

## Question 6

In [63]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]

In [65]:
rouge_1_f1_scores = []

for i in range(len(df_results)):
    row = df_results.iloc[i]

    scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
    rouge_1_f1 = scores['rouge-1']['f']
    rouge_1_f1_scores.append(rouge_1_f1)

In [67]:
sum(rouge_1_f1_scores) / len(rouge_1_f1_scores)

0.3516946452113944