In [None]:
import requests

docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of as a confusing
# JSON decode failure on the following line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course grouping into a single list of records, tagging each
# record with the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        # Note: this mutates the dict that lives inside documents_raw.
        doc["course"] = course_name
        documents.append(doc)

In [2]:
# Spot-check one flattened record; it should carry the added "course" key.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import minsearch

# Build the in-memory search index over the flattened FAQ records.
# NOTE(review): presumably text_fields are matched as free text while
# keyword_fields are used for exact-match filtering (search() filters on
# "course") — confirm against the minsearch documentation.
index = minsearch.Index(
    text_fields=["question", "text", "section"], keyword_fields=["course"]
)

# fit() is the last expression in the cell, so its return value is displayed.
index.fit(documents)

<minsearch.minsearch.Index at 0x23ce5333eb0>

In [4]:
from openai import OpenAI

# No credentials are passed explicitly. NOTE(review): presumably the client
# reads the API key from the environment (OPENAI_API_KEY) — confirm it is set
# before running the cells below.
client = OpenAI()

In [None]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries for ``query`` in the module-level ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field of a document must equal. Defaults
            to the course that used to be hard-coded here, so existing
            ``search(query)`` calls behave exactly as before.
        num_results: Maximum number of hits to request from the index.

    Returns:
        Whatever ``index.search`` returns; ``build_prompt`` iterates over it
        and reads the ``section``, ``question`` and ``text`` keys of each hit.
    """
    # Per-field weights passed to the index: matches in "question" count more,
    # matches in "section" count less.
    # NOTE(review): fields not listed here presumably keep a default weight
    # of 1.0 — confirm against the minsearch documentation.
    boost = {"question": 3.0, "section": 0.5}

    results = index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

    return results

In [None]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of dict-like records, each providing the
            ``section``, ``question`` and ``text`` keys.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    # Spelled out as adjacent literals so the trailing space after "CONTEXT:"
    # is visible and cannot be lost to whitespace-trimming editors.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per hit; each block ends with a blank line as a separator.
    entries = (
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )
    rendered_context = "".join(entries)

    # The final strip() drops the separator left after the last entry.
    return template.format(question=query, context=rendered_context).strip()

In [None]:
def llm(prompt, model="gpt-4o"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.
        model: Name of the chat model to call. Defaults to ``"gpt-4o"``, the
            value that used to be hard-coded, so ``llm(prompt)`` is unchanged.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is read; no other choices are requested above.
    return response.choices[0].message.content

In [8]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the model.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the prompt built from the search hits.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [None]:
# Example query: run the full pipeline on a Kafka setup question.
rag("how do I run kafka?")

"To run Kafka, it depends on whether you're using Java or Python. Here’s how you can do it for both:\n\n1. **Java Kafka Producer/Consumer/KStreams:**\n   - Navigate to your project directory.\n   - Run the following command in the terminal, replacing `<jar_name>` with your jar file's name:\n     ```bash\n     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n     ```\n\n2. **Python Kafka Producer:**\n   - Create a virtual environment and install the necessary packages by following these steps:\n     ```bash\n     python -m venv env\n     source env/bin/activate\n     pip install -r ../requirements.txt\n     ```\n   - Activate the virtual environment each time you run your Python files:\n     ```bash\n     source env/bin/activate\n     ```\n   - If you're on Windows, use:\n     ```bash\n     env\\Scripts\\activate\n     ```\n   - To deactivate the environment, simply run:\n     ```bash\n     deactivate\n     ```\n   - Ensure that your Docke

In [None]:
# Example query phrased differently from the stored FAQ question
# ("Course - Can I still join the course after the start date?").
rag("the course has already started, can I still enroll?")

"Yes, even after the course has started, you can still enroll. You are eligible to submit the homework, but be mindful of the deadlines for final projects, as it's best not to leave everything until the last minute."