In [98]:
from sentence_transformers import SentenceTransformer

In [99]:
# Load the pretrained sentence-embedding model used for every embedding below
homework_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [100]:
# Embed the sample user question
embedding = homework_model.encode("I just discovered the course. Can I still join it?")
# Show the first component of the resulting vector
embedding[0]

0.078222655

In [5]:
import requests 

# Download the FAQ documents (each carries an 'id') from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [9]:
# Peek at one record to see which fields a document has
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [22]:
def filter_documents_by_course(documents, course_name):
    """Return the documents whose 'course' field equals ``course_name``.

    Input order is preserved and a new list is returned. Every document must
    have a 'course' key; a missing key raises KeyError.
    """
    matching = []
    for record in documents:
        if record['course'] == course_name:
            matching.append(record)
    return matching

In [23]:
# Keep only the machine-learning-zoomcamp FAQ entries and show how many remain
filtered_docs = filter_documents_by_course(documents, 'machine-learning-zoomcamp')
len(filtered_docs)

375

In [101]:
# Embed every filtered document as "<question> <text>"; each vector is stored
# as a plain Python list, in the same order as filtered_docs.
embeddings = [
    homework_model.encode(f"{doc['question']} {doc['text']}").tolist()
    for doc in filtered_docs
]

In [102]:
# Number of embedded documents (one embedding per entry of filtered_docs)
len(embeddings)

375

In [103]:
# Length of a single embedding vector, i.e. the embedding dimensionality
len(embeddings[25])

768

In [55]:
import numpy as np

In [56]:
# Stack the per-document embedding lists into one matrix (one row per document)
X = np.array(embeddings)

In [57]:
# (number of documents, embedding dimensionality)
X.shape

(375, 768)

In [58]:
# Query vector: the embedding of the sample question as a NumPy array
v = np.array(embedding)

In [61]:
# Highest dot-product score between the query vector and any document embedding.
# NOTE(review): presumably equals cosine similarity if the model emits
# unit-length vectors -- confirm against the model card.
np.max(X.dot(v))

0.6506574320371066

In [62]:
class VectorSearchEngine():
    """Brute-force vector search over a matrix of document embeddings.

    Documents are ranked by the dot product between their embedding and the
    query vector, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the documents together with their embeddings.

        Args:
            documents: indexable sequence of documents; ``documents[i]`` is
                the document described by row ``i`` of ``embeddings``.
            embeddings: 2-D array-like with one row per document. Both an
                ``np.ndarray`` and a plain list of equal-length vectors work.
        """
        self.documents = documents
        # np.asarray returns an ndarray unchanged, and turns a list of vectors
        # into one so that `.dot` below is available for either input type.
        self.embeddings = np.asarray(embeddings)

    def search(self, v_query, num_results=10):
        """Return the best-scoring documents for a query vector.

        Args:
            v_query: 1-D query vector (ndarray or list) with the same length
                as one embedding row.
            num_results: maximum number of documents to return.

        Returns:
            List of at most ``num_results`` documents, highest score first.
        """
        scores = self.embeddings.dot(v_query)
        # argsort is ascending, so sort the negated scores to get best-first
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

# Build the engine over the filtered documents and their embedding matrix,
# then display the five best matches for the sample query vector.
search_engine = VectorSearchEngine(filtered_docs, X)
search_engine.search(v, 5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [63]:
import pandas as pd

# Load the ground-truth question/document pairs and keep only the rows that
# belong to the machine-learning-zoomcamp course.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url).loc[
    lambda frame: frame.course == 'machine-learning-zoomcamp'
]
# One dict per row, keyed by column name
ground_truth = df_ground_truth.to_dict(orient='records')

In [67]:
from tqdm.auto import tqdm

In [74]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one sequence per query; an entry is True when the
            result at that position is the expected document.

    Returns:
        The share of queries with at least one True entry, or 0.0 when
        ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries were evaluated; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [91]:

# Evaluate the in-memory vector search: for every ground-truth question, record
# at which of the top-5 positions (if any) the expected document id appears.
relevance_total = []

# The engine does not depend on the query, so build it once outside the loop.
search_engine = VectorSearchEngine(documents=filtered_docs, embeddings=X)

for q in tqdm(ground_truth):
    doc_id = q['document']
    # Use a separate name for the embedding so the ground-truth record `q`
    # is not overwritten while it is still the loop variable.
    v_q = homework_model.encode(q['question'])
    results = search_engine.search(v_q, num_results=5)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [92]:
# Hit rate of the in-memory vector search over the ground-truth questions
hit_rate(relevance_total)

0.9398907103825137

In [93]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node at localhost:9200
es_client = Elasticsearch('http://localhost:9200') 

# Request cluster info; doubles as a connectivity check and is shown as output
es_client.info()

ObjectApiResponse({'name': '8075755d8443', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'EFk9JbMyRK28mmojLgzVug', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [108]:
# Index definition: a single shard without replicas, analysed text fields for
# the FAQ content, an exact-match keyword field for the course, and a
# 768-dimensional dense vector compared with cosine similarity.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

In [105]:
index_name = "course-questions"

# Drop any existing index of this name first so the cell can be re-run;
# ignore_unavailable=True is passed so a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Create the index from the settings and mappings held in index_settings
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [110]:
# Attach a 'text_vector' embedding of "<question> <text>" to every document so
# it can be indexed into the dense_vector field.
embedded_docs = []

for doc in filtered_docs:
    question = doc['question']
    text = doc['text']
    qa_text = f'{question} {text}'
    # Build a copy instead of writing into `doc`: the same dict objects are
    # referenced by filtered_docs (and anything built from it), so an in-place
    # update would silently add the vector to those structures as well.
    embedded_doc = {**doc, 'text_vector': homework_model.encode(qa_text).tolist()}
    embedded_docs.append(embedded_doc)

In [111]:
# Index the documents one by one. A failure does not stop the loop, but it is
# recorded together with the document id so it can be inspected afterwards.
index_failures = []

for doc in embedded_docs:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        index_failures.append((doc.get('id'), e))
        print(f"Failed to index document {doc.get('id')!r}: {e}")

# Refresh explicitly so the documents are searchable by the time the next
# cell runs; the trailing ';' suppresses the response output.
es_client.indices.refresh(index=index_name);

In [112]:
# Sample user question and its embedding, used as the kNN query vector below
search_term = "I just discovered the course. Can I still join it?"
vector_search_term = homework_model.encode(search_term)

In [113]:
# kNN search parameters: search the "text_vector" field with the embedded
# question and ask for the 5 nearest neighbours.
# NOTE(review): num_candidates presumably bounds how many candidates are
# examined per shard before the top k are kept -- confirm in the kNN search docs.
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [114]:
# Run the kNN search, returning only the listed fields in each hit's _source
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
# Display the list of hits
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'Bm0IwZABupWVn6tlAet7',
  '_score': 0.8253288,
  '_source': {'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'}},
 {'_index': 'course-questions',
  '_id': 'CW0IwZABupWVn6tlAeuv',
  '_score': 0.73585373,
  '_source': {'question': 'I just joined. What should I do next? How can I access course materials?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Welcome to the course! Go to the course page (http://

In [115]:
def elastic_search(query, course, num_results=5):
    """Keyword search over the indexed documents of a single course.

    Runs a ``multi_match`` query against the question, text and section
    fields (question matches are boosted by a factor of 5) and keeps only
    documents whose ``course`` field equals ``course``. The module-level
    ``es_client`` and ``index_name`` are used for the request.

    Args:
        query: free-text search string.
        course: exact course name to filter on.
        num_results: maximum number of hits to return (default 5).

    Returns:
        List with the ``_source`` dict of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^5", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [117]:
# Evaluate the Elasticsearch keyword search: for every ground-truth question,
# flag which of the returned documents is the expected one.
es_relevance_total = [
    [
        hit['id'] == q['document']
        for hit in elastic_search(query=q['question'], course=q['course'])
    ]
    for q in tqdm(ground_truth)
]

  0%|          | 0/1830 [00:00<?, ?it/s]

In [118]:
# Hit rate of the Elasticsearch keyword search over the ground-truth questions
hit_rate(es_relevance_total)

0.6704918032786885