# Homework: Vector Search

In [37]:
from sentence_transformers import SentenceTransformer
import requests 
from tqdm.auto import tqdm
import json
import numpy as np
import pandas as pd

In [None]:
# Query text that gets embedded below, and the course used to filter the documents.
user_question = "I just discovered the course. Can I still join it?"
course_name = 'machine-learning-zoomcamp'

In [38]:
# Load the sentence-transformers embedding model by name; the same model
# encodes both the query and the documents further down.
model_name = 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)

# Q1. Getting the embeddings model <br>

In [None]:
# Encode the user question into the query vector v
v = embedding_model.encode(user_question)

In [None]:
# Get the first value of the resulting vector
v[0]

Q1-A. 0.07

# Prepare the documents


In [None]:
# Load the documents with ids
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page
docs_response.raise_for_status()
documents = docs_response.json()

# Q2. Creating the embeddings <br>



In [None]:
documents

In [None]:
# Keep only the documents whose 'course' field matches course_name
filtered_documents = [
    entry for entry in documents if entry['course'] == course_name
]

In [None]:
len(filtered_documents)

In [None]:
# Embed every filtered document as "<question> <text>", preserving document order
embeddings = [
    embedding_model.encode(f"{entry['question']} {entry['text']}")
    for entry in tqdm(filtered_documents)
]

In [None]:
# Define the embedding matrix X.
# Convert the list of embeddings to a NumPy array; row i holds the embedding of
# filtered_documents[i] because the list was built by iterating them in order.
X = np.array(embeddings)

In [None]:
X.shape

Q2-A: (375, 768)

# Q3. Search

In [None]:
v.dot(v)

In [None]:
# Dot product of the query vector with itself
dot_product_v = np.dot(v, v)
print(dot_product_v)

In [None]:
# Score every document: dot product of each row of X with the query vector v.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-normalized — confirm for this model.
scores = X.dot(v)


In [None]:
# Highest score across all documents for the query vector
highest_score = np.max(scores)
print(highest_score)

Q3-A: 0.65

# Vector search

idx = np.argpartition(-scores, num_results)[:num_results]<br><br>
函數 np.argpartition 用於對陣列進行部分排序。與提供完整排序順序的 np.argsort 不同，np.argpartition 確保傳回索引處的元素是最小的（如果對陣列取反則為最大），但不保證這些元素的順序。<br><br>
Negating Scores: `-scores` 將餘弦相似度分數取負號，把尋找最大值的問題轉換為尋找最小值（因為最大的正值取負後會變成最小的值）。<br><br>
Partial Sorting:np.argpartition(-scores, num_results) 對 -scores 執行部分排序。頂部 num_results 元素的索引將會移到陣列的前 num_results 位置，但它們不會在這些位置內排序。此操作比完全排序更有效，因為它不需要對整個陣列進行排序，只需確保頂部 num_results 元素位於正確的位置。<br><br>
Selecting Top Results:[:num_results] 從部分排序的陣列中選擇前 num_results 索引，這些索引對應於原始陣列中最高 num_results 分數的位置。



In [None]:
class VectorSearchEngine():
    """Brute-force vector search over a matrix of document embeddings.

    Row ``i`` of ``embeddings`` must be the embedding of ``documents[i]``;
    search results are looked up in ``documents`` by row index.
    """

    def __init__(self, documents, embeddings):
        """Store the documents and their row-aligned embedding matrix."""
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the best-scoring documents for a query vector.

        Args:
            v_query: query embedding; it is dotted with every row of
                ``self.embeddings``.
            num_results: maximum number of documents to return.

        Returns:
            A list of at most ``num_results`` documents ordered from the
            highest dot-product score to the lowest. Empty when there are
            no embeddings or ``num_results`` is not positive.
        """
        scores = self.embeddings.dot(v_query)
        total = len(scores)
        k = min(num_results, total)
        if k <= 0:
            return []

        if k < total:
            # argpartition needs an explicit kth; partitioning at k - 1 puts
            # the k largest scores (smallest negated scores) in the first k
            # slots, in arbitrary order.
            top = np.argpartition(-scores, k - 1)[:k]
        else:
            # Every row is requested, so there is nothing to partition.
            top = np.arange(total)

        # Order only the selected candidates, best score first.
        idx = top[np.argsort(-scores[top], kind='stable')]
        return [self.documents[i] for i in idx]

# Index the same documents X was built from, so each row index returned by the
# search maps to the document that produced that embedding.
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(v, num_results=5)

# Q4. Hit-rate for our search engine


為了計算 VectorSearchEngine 的命中率，我們將搜尋結果與真實數據進行比較。命中率衡量正確文件位於搜尋引擎返回的最佳結果中的查詢的比例。

In [None]:
# Load the ground-truth question/document pairs used for evaluation
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
# Filter on the shared course_name so the ground truth always covers the same
# course as the indexed documents.
df_ground_truth = df_ground_truth[df_ground_truth.course == course_name]
ground_truth = df_ground_truth.to_dict(orient='records')

In [43]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose relevance list contains a hit.

    Each element of ``relevance_total`` is the list of relevance flags for one
    query; the query counts as a hit when ``True`` is among its flags.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

In [44]:
def evaluate(ground_truth, search_function):
    """Run every ground-truth record through ``search_function`` and score it.

    Args:
        ground_truth: records whose 'document' key holds the expected
            document id.
        search_function: callable that takes one record and returns result
            dicts carrying an 'id' key.

    Returns:
        A dict with the single key 'hit_rate'.
    """
    def _relevance(record):
        # One flag per returned result: does it match the expected document?
        expected_id = record['document']
        hits = search_function(record)
        return [hit['id'] == expected_id for hit in hits]

    per_query = [_relevance(record) for record in tqdm(ground_truth)]
    return {'hit_rate': hit_rate(per_query)}

In [45]:
# Search callback handed to evaluate()
def search_function(query):
    """Embed the record's question and return the top 5 hits from search_engine."""
    question_vector = embedding_model.encode(query['question'])
    top_hits = search_engine.search(question_vector, num_results=5)
    return top_hits

In [41]:
# Calculate the hit rate.
# Rebuild the engine over filtered_documents (the documents X was computed
# from) before evaluating, so returned row indices map to the right documents.
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
results = evaluate(ground_truth, search_function)


100%|██████████| 1830/1830 [01:24<00:00, 21.78it/s]


In [47]:
results['hit_rate']


0.9398907103825137

Q4-A: 0.93

# Q5. Indexing with Elasticsearch
