In [3]:
from fastembed import TextEmbedding
import numpy as np

query_text = 'I just discovered the course. Can I join now?'
model_name = "jinaai/jina-embeddings-v2-small-en"

embedding_model = TextEmbedding(model_name=model_name)
print(f"Model '{model_name}' loaded successfully.")

embeddings_list = list(embedding_model.embed(query_text))
query_embedding = embeddings_list[0]

dimensionality = query_embedding.shape[0]
min_value = query_embedding.min()

print(f"\nQuery: '{query_text}'")
print(f"Embedding dimensionality: {dimensionality}")
print(f"Minimum value in embedding: {min_value}")

print("\n--- Q1 Answer ---")
print(f"The minimum value is {min_value:.4f}")

Model 'jinaai/jina-embeddings-v2-small-en' loaded successfully.

Query: 'I just discovered the course. Can I join now?'
Embedding dimensionality: 512
Minimum value in embedding: -0.11726373885183883

--- Q1 Answer ---
The minimum value is -0.1173


In [4]:
import numpy as np

new_doc_text = 'Can I still join the course after the start date?'

new_doc_embedding_list = list(embedding_model.embed(new_doc_text))
new_doc_embedding = new_doc_embedding_list[0]

cosine_similarity = query_embedding.dot(new_doc_embedding)

print(f"Cosine similarity between query and new document: {cosine_similarity}")

print("\n--- Q2 Answer ---")
print(f"The calculated similarity is {cosine_similarity:.4f}")

Cosine similarity between query and new document: 0.9008528895674548

--- Q2 Answer ---
The calculated similarity is 0.9009


In [6]:
import numpy as np

documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.', 'section': 'General course-related questions', 'question': 'Course - What can I do before the course starts?', 'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.', 'section': 'General course-related questions', 'question': 'How can we contribute to the course?', 'course': 'data-engineering-zoomcamp'}
]

doc_texts = [d['text'] for d in documents]
doc_embeddings = embedding_model.embed(doc_texts)
doc_embeddings_array = np.array(list(doc_embeddings))

scores = doc_embeddings_array.dot(query_embedding)
best_doc_index = np.argmax(scores)

print("Similarity scores for each document:")
print(scores)
print(f"\nIndex of the document with the highest similarity: {best_doc_index}")

print("\n--- Q3 Answer ---")
print(f"The index with the highest score is {best_doc_index}")

Similarity scores for each document:
[0.76296847 0.81823782 0.80853974 0.7133079  0.73044992]

Index of the document with the highest similarity: 1

--- Q3 Answer ---
The index with the highest score is 1


In [7]:
import numpy as np

full_texts = [f"{d['question']} {d['text']}" for d in documents]
full_text_embeddings = embedding_model.embed(full_texts)
full_text_embeddings_array = np.array(list(full_text_embeddings))

new_scores = full_text_embeddings_array.dot(query_embedding)
new_best_doc_index = np.argmax(new_scores)

print("New similarity scores for each document (using 'question' + 'text'):")
print(new_scores)
print(f"\nNew index of the document with the highest similarity: {new_best_doc_index}")

print("\n--- Q4 Answer ---")
print(f"The new best index is {new_best_doc_index}.")

if new_best_doc_index == best_doc_index:
    print("The result is the same as in Q3.")
else:
    print("The result is different from Q3.")
    print("Reason: Concatenating the 'question' with the 'text' provides more specific semantic context to the embedding model, which can lead to a better alignment with the query and thus a different ranking.")

New similarity scores for each document (using 'question' + 'text'):
[0.85145432 0.84365942 0.8408287  0.7755158  0.80860078]

New index of the document with the highest similarity: 0

--- Q4 Answer ---
The new best index is 0.
The result is different from Q3.
Reason: Concatenating the 'question' with the 'text' provides more specific semantic context to the embedding model, which can lead to a better alignment with the query and thus a different ranking.


In [8]:
from fastembed import TextEmbedding

model_name_q5 = "BAAI/bge-small-en"
embedding_model_q5 = TextEmbedding(model_name=model_name_q5, cache_dir="models")

sample_text = "This is a sample text to check dimensionality."
sample_embedding = list(embedding_model_q5.embed(sample_text))[0]
dimensionality_q5 = sample_embedding.shape[0]

print(f"The dimensionality of the model '{model_name_q5}' is: {dimensionality_q5}")

print("\n--- Q5 Answer ---")
print("Based on the provided options, the smallest available dimensionality for high-performance models like the one tested is 384.")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/133M [00:00<?, ?B/s]

The dimensionality of the model 'BAAI/bge-small-en' is: 384

--- Q5 Answer ---
Based on the provided options, the smallest available dimensionality for high-performance models like the one tested is 384.


In [10]:
import requests
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

# 1. Download and filter the documents
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []
for course in documents_raw:
    course_name = course['course']
    if course_name == 'machine-learning-zoomcamp':
        for doc in course['documents']:
            doc['course'] = course_name
            documents.append(doc)

print(f"Number of documents for 'machine-learning-zoomcamp': {len(documents)}")

# 2. Initialize the new embedding model
model_name_q6 = "BAAI/bge-small-en-v1.5" # Using the full name for clarity
embedding_model_q6 = TextEmbedding(model_name=model_name_q6, cache_dir="models")

# 3. Initialize Qdrant client and create a new collection
qd_client = QdrantClient("http://localhost:6333")
collection_name_q6 = "ml-zoomcamp-faq"
embedding_dimensionality_q6 = 384

qd_client.recreate_collection(
    collection_name=collection_name_q6,
    vectors_config=models.VectorParams(
        size=embedding_dimensionality_q6,
        distance=models.Distance.COSINE
    )
)
print(f"Collection '{collection_name_q6}' created or recreated successfully.")

# 4. Prepare and upsert points with manual embedding

texts_to_embed_q6 = [f"{d['question']} {d['text']}" for d in documents]
embeddings_q6 = embedding_model_q6.embed(texts_to_embed_q6, batch_size=32)

points_q6 = []
for idx, doc in enumerate(documents):
    point = models.PointStruct(
        id=idx,
        vector=next(embeddings_q6).tolist(), # Using next() on the generator
        payload=doc
    )
    points_q6.append(point)

qd_client.upsert(
    collection_name=collection_name_q6,
    points=points_q6,
    wait=True
)
print("Upsert operation completed.")

# 5. Search the collection
query_text_q1 = 'I just discovered the course. Can I join now?'
query_vector = list(embedding_model_q6.embed(query_text_q1))[0]

search_result = qd_client.search(
    collection_name=collection_name_q6,
    query_vector=query_vector,
    limit=1
)

highest_score = search_result[0].score

print(f"\nSearch query: '{query_text_q1}'")
print(f"The highest score from the search result is: {highest_score}")

print("\n--- Q6 Answer ---")
print(f"The highest score is {highest_score:.2f}")

Number of documents for 'machine-learning-zoomcamp': 375


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

  qd_client.recreate_collection(


Collection 'ml-zoomcamp-faq' created or recreated successfully.
Upsert operation completed.

Search query: 'I just discovered the course. Can I join now?'
The highest score from the search result is: 0.739779

--- Q6 Answer ---
The highest score is 0.74


  search_result = qd_client.search(
