In [33]:
# Download the minsearch helper module that the next cell imports.
# NOTE(review): the recorded output of this cell shows "404 Not Found" for this
# URL, so nothing was downloaded on that run — confirm where minsearch.py
# should come from before relying on this cell.
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

--2025-06-25 21:37:31--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-06-25 21:37:31 ERROR 404: Not Found.



In [34]:
import minsearch

In [35]:
import json

In [36]:
# Load the raw FAQ data. The cell that follows iterates over it as a sequence
# of dicts with a 'course' name and a 'documents' list.
# The encoding is explicit because the text contains non-ASCII characters
# (e.g. curly quotes) and the default encoding of open() is platform-dependent.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [37]:
# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the name of the course it belongs to.
# A new dict is built for every record so that `docs_raw` is left untouched and
# this cell can be re-run without side effects on the loaded data.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [38]:
# Display the first flattened record to check its fields.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [39]:
# Create the in-memory minsearch index. 'question', 'text' and 'section' are
# registered as text fields; 'course' is registered as a keyword field and is
# what search() later passes in its filter_dict.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

In [40]:
# Example user question.
query = "the course has already started, can I still enroll?"

In [41]:
# Feed the flattened FAQ records to the minsearch index.
index.fit(documents)

<minsearch.Index at 0x173d1f310>

In [42]:
from dotenv import load_dotenv
import os
from openai import OpenAI

In [43]:
# Bring the variables defined in the local .env file into the environment
load_dotenv()

# Look up the OpenAI API key; the result is None when the variable is not set
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [44]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ records in the in-memory minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field is filtered on. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded here.
        num_results: Number of results requested from the index
            (defaults to 5, as before).

    Returns:
        Whatever ``index.search`` returns for these arguments
        (presumably a list of matching record dicts — confirm against
        minsearch).
    """
    # Per-field boost values handed to the index: 3.0 for 'question' and
    # 0.5 for 'section'.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [45]:
def build_prompt(docs, question):
    """Render the prompt sent to the LLM for ``question``.

    Args:
        docs: Iterable of records, each providing the keys ``'section'``,
            ``'question'`` and ``'text'``.
        question: The user's question, inserted after ``QUESTION:``.

    Returns:
        The filled-in template with leading and trailing whitespace removed.
        Each record contributes a ``section:`` / ``question:`` / ``answer:``
        triple to the CONTEXT part, followed by a blank line.
    """
    prompt_template = """
    You are a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
    Use only the facts from the CONTEXT. If you don't know the answer, say that you don't know.
    If the question is not related to the CONTEXT, say that you don't know.
    If the question is not clear, ask for more details.

    QUESTION: {question}

    CONTEXT: {context}
    """

    # One three-line entry per record, joined in the order the records arrive.
    context = "".join(
        f"section: {entry['section']}\n"
        f"question: {entry['question']}\n"
        f"answer: {entry['text']}\n\n"
        for entry in docs
    )

    return prompt_template.format(question=question, context=context).strip()

In [46]:
def llm(prompt, model="gpt-4o-mini"):
    """Send ``prompt`` to the OpenAI chat completions API and return the reply.

    Args:
        prompt: Text sent as the single ``user`` message.
        model: Name of the chat model to call. Defaults to ``"gpt-4o-mini"``,
            the value that used to be hard-coded here.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # The client is created with the key read from the environment earlier
    # in the notebook.
    client = OpenAI(api_key=openai_api_key)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [47]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance at localhost:9200 (plain HTTP).
es_client = Elasticsearch("http://localhost:9200")

In [48]:
# Index definition: a single shard with no replicas, and an explicit mapping
# for the four fields of each FAQ record. 'course' is mapped as a keyword
# because elastic_search() filters on it with a `term` query; the other
# fields are mapped as full text.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-run:
# creating an index that already exists raises an error, and keeping the old
# index would make the indexing cell add every document a second time.
# ignore_unavailable=True makes the delete a no-op when the index is missing.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [49]:
from tqdm.auto import tqdm

# Send the FAQ records to Elasticsearch one at a time, with a progress bar.
for faq_record in tqdm(documents):
    es_client.index(index=index_name, body=faq_record)

# Refresh the index once all records have been sent.
es_client.indices.refresh(index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

ObjectApiResponse({'_shards': {'total': 1, 'successful': 1, 'failed': 0}})

In [50]:
# Example user question (same text as the assignment to `query` earlier in the notebook).
query = "the course has already started, can I still enroll?"

In [51]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index for FAQ records matching ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field is filtered on with a ``term``
            query. Defaults to ``"data-engineering-zoomcamp"``, the value
            that used to be hard-coded here.
        num_results: Value sent as the request's ``size``
            (defaults to 5, as before).

    Returns:
        A list with the ``_source`` of every hit, in the order the hits
        appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts the 'question' field relative to the others.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents and drop the per-hit metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

In [52]:
# Example question passed to rag() in the next cell.
q = "how do I run kafka?"
def rag(query):
    """Answer ``query`` with the retrieve-then-generate pipeline.

    The question is first searched with ``elastic_search``; the hits and the
    question are turned into a prompt by ``build_prompt``; the prompt is sent
    to ``llm`` and its reply is returned.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(retrieved, query))

In [53]:
# Run the pipeline on the example question; print() is used so the newlines
# in the returned string are rendered rather than shown escaped.
print(rag(q))

To run Kafka, you would typically execute the Java-based Kafka applications by running commands in the terminal. Based on the provided context, you can run a Kafka producer or consumer in your project directory using a command like the following, which adjusts the classpath and specifies the Java class to run:

```bash
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

This command assumes you have already compiled your Java classes and have the proper jar files available in the specified directories.
