In [None]:
Install Required Libraries

In [None]:
!pip install minsearch
!pip install -q "qdrant-client[fastembed]>=1.14.2"
!pip install google-generativeai

In [6]:
import minsearch
import json
import google.generativeai as genai
import os
import requests 
import tqdm as notebook_tqdm

In [24]:
genai.configure(api_key="AIzaSyA5FWmxN9u1AcMkw9e52EqikjY5nRNcCD0") #--- Input your Gemini API Key
model = genai.GenerativeModel('gemini-2.5-flash')

In [7]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [12]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [19]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)
index.fit(documents)

<minsearch.minsearch.Index at 0x1a8ed45e000>

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [28]:
q = 'the course has already started, can I still enroll?'

In [29]:
response = model.generate_content([
     {"role": "user", "parts": [q]}
])

In [None]:
print(response.text)

In [33]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [34]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [35]:
def llm(prompt):
    response = model.generate_content(
        {"role": "user", "parts": [prompt]}
    )
    return response.text

In [36]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [40]:
rag('the course has already started, can I still enroll?')

"Yes, even if you don't register, you are still eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects, so don't leave everything for the last minute."

In [None]:
## RAG with Vector Search

In [None]:
#Import Required Libraries & Connect to Qdrant

In [47]:
from qdrant_client import QdrantClient, models

In [42]:
qd_client = QdrantClient("http://localhost:6333")

In [48]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [49]:
collection_name = "zoomcamp-faq"

In [50]:
qd_client.delete_collection(collection_name=collection_name)

False

In [51]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [52]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [53]:
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [56]:
!pip install huggingface_hub[hf_xet] #Package for Xet Storag

Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl.metadata (883 bytes)
Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 2.7/2.7 MB 31.8 MB/s eta 0:00:00
Installing collected packages: hf-xet
Successfully installed hf-xet-1.1.5


In [54]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.53s/it]


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [58]:
question = 'I just discovered the course. Can I still join it?'

In [59]:
def vector_search(question):
    print('vector_search is used')
    
    course = 'data-engineering-zoomcamp'
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        query_filter=models.Filter( 
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=5,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

In [60]:
def rag_vector_search(query):
    search_results = vector_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [63]:
rag_vector_search('I just discovered the course. Can I join now?')

vector_search is used


"Yes, even if you don't register, you're still eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects."

In [None]:
#Q1. Embedding the query
#Embed the query: 'I just discovered the course. Can I join now?'. Use the 'jinaai/jina-embeddings-v2-small-en' model.

#You should get a numpy array of size 512.
#What's the minimal value in this array?

In [67]:
#Q1 - Initialize
!pip install fastembed
from fastembed import TextEmbedding
import numpy as np



In [84]:
query = 'I just discovered the course. Can I join now?'

# 1. Initialize the FastEmbed model
model_name = "jinaai/jina-embeddings-v2-small-en"
embedding_model = TextEmbedding(model_name=model_name)

EMBEDDING_DIMENSIONALITY = 512


embeddings_generator = embedding_model.embed(query)
embeddings_list = list(embeddings_generator)
len(embeddings_list[0])  


512

In [85]:
# 4. Find the minimal value in the array
min_value = np.min(embeddings_list)

print(f"The minimal value in the embedding array is: {min_value}")

The minimal value in the embedding array is: -0.11726373885183883


In [88]:
import numpy as np
np.linalg.norm(query)

ValueError: could not convert string to float: 'I just discovered the course. Can I join now?'