In [None]:
#!pip install -U minsearch qdrant_client

In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Q1. Minsearch text
### Answer: HR = 0.8487

In [3]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x76ffb440c1d0>

In [4]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [5]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

# Q2. Vector search for question
### Answer: MRR = 0.3571

In [6]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [7]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [8]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x76ffb389e3c0>

In [9]:
def minsearch_search_TF_IDF(query, course):
    #boost = {'question': 1.5, 'section': 0.1}
    query_embedding = pipeline.transform([query])  # Эмбеддинг запроса
    
    results = vindex.search(
        query_embedding,
        filter_dict={'course': course},
        #boost_dict=boost,
        num_results=5
    )

    return results

In [10]:
evaluate(ground_truth, lambda q: minsearch_search_TF_IDF(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

# Q3. Vector search for question and answer
### Answer: HR=0.8210

In [11]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

X = pipeline.fit_transform(texts)
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x76ffb389e3c0>

In [12]:
evaluate(ground_truth, lambda q: minsearch_search_TF_IDF(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

# Q4. Qdrant
### Answer: MRR=0.85

In [13]:
#!pip install --upgrade fastembed
#!pip install --upgrade huggingface_hub

'''
pip install -q "qdrant-client[fastembed]>=1.14.2"

docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
'''

'\npip install -q "qdrant-client[fastembed]>=1.14.2"\n\ndocker pull qdrant/qdrant\n\ndocker run -p 6333:6333 -p 6334:6334    -v "$(pwd)/qdrant_storage:/qdrant/storage:z"    qdrant/qdrant\n'

In [14]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models
import json

In [15]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [16]:
for model in TextEmbedding.list_supported_models():
    if model["model"] == model_handle:
        print(json.dumps(model, indent=2))

{
  "model": "jinaai/jina-embeddings-v2-small-en",
  "sources": {
    "hf": "xenova/jina-embeddings-v2-small-en",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "onnx/model.onnx",
  "description": "Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2023 year.",
  "license": "apache-2.0",
  "size_in_GB": 0.12,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}


In [17]:
EMBEDDING_DIMENSIONALITY = 512

In [19]:
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance

In [22]:
# Define the collection name
collection_name = "search_eval"

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `search_eval` already exists!"},"time":0.000046781}'

In [20]:
points = []
id = 0

for doc in tqdm(documents):
    point = models.PointStruct(
        id=id,
        vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "question": doc['question'],
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id" : doc['id']
        } #save all needed metadata fields
    )
    points.append(point)

    id +=1

  0%|          | 0/948 [00:00<?, ?it/s]

In [23]:
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [24]:
def qdrant_search(query, course_filter=None, limit=5):
    # Define the filter (if a course is specified)
    query_filter = None
    if course_filter:
        query_filter = models.Filter(
            must=[  # Must match this condition
                models.FieldCondition(
                    key="course",  # The metadata field to filter on
                    match=models.MatchValue(value=course_filter)  # Exact match
                )
            ]
        )

    results = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle  # Your embedding model
        ),
        query_filter=query_filter,  # Apply the filter here
        limit=limit,
        with_payload=True  # Include metadata in results
    )
    
    return results

In [25]:
def evaluate_qdrant(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d.payload['id'] == doc_id for d in results.points]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [26]:
evaluate_qdrant(ground_truth, lambda q: qdrant_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

# Q5. Cosine simiarity

### Answer: cosine = 0.8416

In [27]:
import numpy as np

def cosine(u, v):
    u = normalize(u)
    v = normalize(v)
    return u.dot(v)

def normalize(u):
    norm = np.sqrt(u.dot(u))
    return u / norm

In [28]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [29]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [30]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [31]:
# Calculate embeddings and cosine similarities
cosine_similarities = []
for _, row in df_results.iterrows():
    v_llm = pipeline.transform([row['answer_llm']]).flatten()
    v_orig = pipeline.transform([row['answer_orig']]).flatten()
    similarity = cosine(v_llm, v_orig)
    cosine_similarities.append(similarity)

# Calculate the average cosine similarity
average_cosine = np.mean(cosine_similarities)
print(f"Average cosine similarity: {average_cosine:.4f}")

Average cosine similarity: 0.8416


# Q6. Rouge
### This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.
### Answer: aver rouge-1 f1 = 0.3517

In [32]:
#!pip install rouge

In [33]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

#here are three scores: rouge-1, rouge-2 and rouge-l, and precision, recall and F1 score for each.
#rouge-1 - the overlap of unigrams,
#rouge-2 - bigrams,
#rouge-l - the longest common subsequence

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [34]:
rouge_1_f1 = []

for _, row in tqdm(df_results.iterrows()):
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    rouge_1_f1.append(scores['rouge-1']['f'])

# Calculate the average rouge-1 f1
average_rouge1_f1 = np.mean(rouge_1_f1)
print(f"Average rouge-1 f1 score: {average_rouge1_f1:.4f}")

0it [00:00, ?it/s]

Average rouge-1 f1 score: 0.3517
