In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Fail fast: the timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() reports HTTP errors directly instead of letting
# .json() fail on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into one list, tagging every entry with the
# name of the course it belongs to. New dicts are built rather than mutating
# `documents_raw`, so re-running the cell always starts from the raw download.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
# Show the first flattened entry; it should now carry the added 'course' key.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
import minsearch

# Text-search index over the FAQ entries: `question`, `text` and `section` are
# registered as text fields, `course` as a keyword field (the `search` helper
# filters on it).
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)
index.fit(documents)


<minsearch.minsearch.Index at 0x1119a68d0>

In [5]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a text search over the FAQ index, restricted to one course.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field must match. Defaults to
            ``'data-engineering-zoomcamp'``.
        num_results: Number of results requested from the index.
            Defaults to 5.

    Returns:
        The result of ``index.search``; the rest of the notebook iterates
        it as a sequence of FAQ dicts.
    """
    # `question` matches are boosted by 3.0 and `section` matches by 0.5;
    # `text` gets no explicit boost.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [10]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of mappings, each providing the keys
            ``section``, ``question`` and ``text``.

    Returns:
        str: The instructions, the question, and a ``CONTEXT:`` section in
        which each entry is rendered as three lines and entries are
        separated by a blank line.
    """
    instructions = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT "
        "from the FAQ database. Use only the facts from the CONTEXT when answering."
    )

    rendered_hits = []
    for hit in search_results:
        rendered_hits.append(
            "\n".join((
                f"section: {hit['section']}",
                f"question: {hit['question']}",
                f"answer: {hit['text']}",
            ))
        )
    context = "\n\n".join(rendered_hits)

    return f"{instructions}\n\nQUESTION: {query}\n\nCONTEXT:\n{context}"


In [6]:
from openai import OpenAI

# Shared client used by `llm`. No credentials are passed here, so the client
# presumably reads them from the environment — confirm OPENAI_API_KEY is set.
openai_client = OpenAI()

In [7]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, e.g. the output of ``build_prompt``.
        model: Chat model name passed to the completions API. Defaults to
            ``'gpt-4o'``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [8]:
query = 'hello sexy?'

def rag(query):
    """Answer ``query`` via text search, prompt assembly and one LLM call."""
    hits = search(query)
    return llm(build_prompt(query, hits))

In [11]:
import minsearch
import pprint
# Run the pipeline on the module-level sample `query` and pretty-print the answer.
pprint.pprint(rag(query))

'Hello! How can I assist you with your course-related inquiries today?'


In [12]:
# Ask an enrollment question and pretty-print the generated answer.
pprint.pprint(rag('the course has already started, can I still enroll?'))


('Yes, you can still enroll in the course even after it has started. You are '
 'eligible to submit homework without registering, but be mindful of the '
 'deadlines for submitting final projects and try not to delay your work until '
 'the last minute.')


In [13]:
## RAG with Vector Search

In [None]:
from qdrant_client import QdrantClient, models

In [15]:
# Client for a Qdrant server expected at http://localhost:6333; the server must
# be running before the cells that use this client are executed.
qd_client = QdrantClient("http://localhost:6333")

In [None]:
# Vector size used when the collection is created. It has to agree with the
# size of the embeddings produced by `model_handle` — presumably 512 for this
# model; confirm against the model's documentation.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model name handed to `models.Document` for both indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [17]:
collection_name = "zoomcamp-faq"
# Drop any existing collection with this name so it can be created fresh.
# NOTE(review): the recorded result was False — presumably nothing existed yet; confirm.
qd_client.delete_collection(collection_name=collection_name)


False

In [18]:
# Create the collection with vectors of EMBEDDING_DIMENSIONALITY components,
# compared by cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [None]:
# Index the `course` payload field as a keyword; `vector_search` filters on it.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
# One point per FAQ entry: the id is the entry's position in `documents`, the
# vector is a `models.Document` wrapping "<question> <text>" together with the
# embedding model name, and the payload is the entry itself.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=entry['question'] + ' ' + entry['text'],
            model=model_handle,
        ),
        payload=entry,
    )
    for position, entry in enumerate(documents)
]

In [26]:
# Send every prepared point to the collection in one `upsert` call.
# NOTE(review): with `models.Document` vectors the embedding is presumably
# computed as part of this call — confirm.
qd_client.upsert(collection_name=collection_name, points=points)


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [27]:
# Sample question for trying out vector search.
# NOTE(review): no visible cell reads this variable; `vector_search` takes its own `question` argument.
question = 'I just discovered the course. Can I still join it?'


In [None]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Retrieve FAQ entries for ``question`` from the Qdrant collection.

    Args:
        question: Free-text query. It is wrapped in a ``models.Document`` with
            ``model_handle``, the same model name used for the indexed points.
        course: Value the ``course`` payload field must match. Defaults to
            ``'data-engineering-zoomcamp'``.
        limit: Maximum number of points requested. Defaults to 5.

    Returns:
        list: The payloads of the returned points, in the order Qdrant
        returned them.
    """
    # Kept on purpose: the recorded notebook output shows which retriever ran.
    print('vector_search is used')

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course),
                )
            ]
        ),
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

In [29]:
def rag(query):
    """Answer ``query`` using vector search for retrieval.

    This redefines the earlier text-search ``rag``: once this cell has run,
    every call to ``rag`` retrieves context through ``vector_search``.
    """
    hits = vector_search(query)
    return llm(build_prompt(query, hits))

In [30]:
# Exercise the vector-search version of `rag` and pretty-print the answer.
pprint.pprint(rag('how do I run kafka?'))


vector_search is used
('To run Kafka, based on the context provided, you need to execute the Java '
 'files (such as JsonProducer.java or JsonConsumer.java) in the terminal. In '
 'the project directory, use the following command:\n'
 '\n'
 '```bash\n'
 'java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out '
 'src/main/java/org/example/JsonProducer.java\n'
 '```\n'
 '\n'
 'Ensure the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` in your Java scripts '
 'points to the correct server URL and that your cluster key and secrets are '
 'updated in `src/main/java/org/example/Secrets.java`. If your Kafka broker is '
 'not available, check your Docker container and use `docker compose up -d` to '
 'start all instances.')
