In [137]:
import json
import minsearch

### Gathering context

In [140]:
# Read the FAQ dump. JSON is UTF-8 by specification, so the encoding is pinned
# instead of relying on the platform default.
with open("documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course structure into a single list of documents, tagging
# each one with the course it belongs to. A copy is built for every document
# so `docs_raw` keeps the shape it was loaded with.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

In [141]:
# `text_fields` are the fields that are searched; `keyword_fields` are the
# fields we want to be able to filter the search results on.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

In [142]:
q = "the course has already started, can I still enroll?"

In [143]:
index.fit(documents)

<minsearch.Index at 0x1547d7350>

In [144]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the minsearch index for documents matching ``query``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field is filtered on.
            Defaults to the course the notebook was written for.
        num_results: Maximum number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns for the given arguments.
    """
    # Boost weights passed to minsearch: "question" is weighted up and
    # "section" is weighted down; "text" is left at its default weight.
    boost = {"question": 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

### OpenAI

In [153]:
# %env OPENAI_API_KEY=value

In [154]:
from openai import OpenAI

client = OpenAI()

In [155]:
# Baseline: send the raw question `q` to the model as a single user message,
# with no FAQ context attached.
response = client.chat.completions.create(model="gpt-4o", messages=[{"role": "user", "content": q}])

# Bare last expression so the notebook displays the text of the first choice.
response.choices[0].message.content

"It depends on the specific course and institution. Most educational institutions have policies regarding late enrollment. Here are some general steps you can take to determine if late enrollment is possible:\n\n1. **Check the Course Enrollment Policies:** Review the institution's guidelines on late enrollment, which are usually available on their website or in the course catalog.\n\n2. **Contact the Instructor:** Reach out to the course instructor to express your interest and explain your situation. They may be able to offer you special accommodations or advise you on the next steps.\n\n3. **Consult the Registrar:** The registrar's office can provide definitive answers regarding deadlines and any possible penalties or requirements for late enrollment.\n\n4. **Consider Online Courses:** Some online courses and programs have more flexible enrollment periods, allowing you to start even after the course has begun.\n\n5. **Prepare to Catch Up:** If you are allowed to enroll late, be prepar

In [164]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
"""


def build_prompt(query, search_results):
    """Fill ``prompt_template`` with the question and the retrieved documents.

    Every item of ``search_results`` must provide the ``section``,
    ``question`` and ``text`` keys. Items are rendered in the order given,
    each followed by a blank line. The filled-in template is returned with
    leading and trailing whitespace stripped.
    """
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    return prompt_template.format(question=query, context="".join(entries)).strip()

In [168]:
def llm_request(prompt, model="gpt-4o"):
    """Send ``prompt`` to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Name of the model to query. Defaults to ``"gpt-4o"``.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [169]:
query = "how do I run kafka?"


def rag(query, search_fn=None):
    """Answer ``query`` by retrieving documents and asking the LLM.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the documents to
            put into the prompt. Defaults to the minsearch-backed ``search``,
            so existing ``rag(query)`` calls behave as before.

    Returns:
        The answer text returned by ``llm_request``.
    """
    # Resolve the default at call time so the retriever can be swapped
    # without redefining this function.
    if search_fn is None:
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm_request(prompt)

In [171]:
answer = rag(query)

In [172]:
print(answer)

To run Kafka producer, consumer, kstreams, etc., in the terminal, navigate to your project directory and use the following command:

```sh
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

Make sure to replace `<jar_name>` with the actual name of your JAR file.


### Elasticsearch

In [173]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")

In [174]:
es_client.info()

ObjectApiResponse({'name': 'e60b8163a9c5', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ry3ISRUrQ8-Kb_xvVUGppA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [177]:
# Single-shard, no-replica index: full-text fields for the searchable content
# and a keyword field for the course so it can be filtered on exactly.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

# Elasticsearch rejects creating an index that already exists, which made this
# cell fail on every re-run. Only create the index when it is missing.
# Note: an existing index keeps its current settings and mappings; delete it
# manually first if `index_settings` has changed.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

In [180]:
from tqdm.auto import tqdm

# Send the documents to Elasticsearch one request at a time; tqdm shows progress.
# NOTE(review): no explicit `id` is passed, so re-running this cell presumably
# indexes every document a second time — confirm, and clear the index first if so.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1015/1015 [00:03<00:00, 298.73it/s]


In [189]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index for documents matching ``query``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field is filtered on with a ``term``
            filter. Defaults to the course the notebook was written for.
        num_results: Value sent as the ``size`` of the search request.

    Returns:
        A list with the ``_source`` of every hit, in the order returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" weights matches in the question field
                        # three times as much as the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [190]:
def rag(query, search_fn=None):
    """Answer ``query`` by retrieving documents and asking the LLM.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the documents to
            put into the prompt. Defaults to the Elasticsearch-backed
            ``elastic_search``, so existing ``rag(query)`` calls behave as
            before.

    Returns:
        The answer text returned by ``llm_request``.
    """
    # Resolve the default at call time so the retriever can be swapped
    # without redefining this function.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm_request(prompt)

In [191]:
query = "how do I run kafka?"
answer = rag(query)

In [193]:
print(answer)

To run Kafka, you can follow these instructions:

If you are using Java Kafka and want to run a producer, consumer, KStreams, etc., you can do so from the terminal. Navigate to your project directory and execute the following command:

```sh
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

Replace `<jar_name>` with the actual name of your JAR file. This command compiles and runs your Java Kafka application.

For running Python Kafka code, it is recommended to create a virtual environment and install the necessary dependencies by using the provided `requirements.txt` file. Here’s how you can do it:

1. **Create a virtual environment** (run this command only once):
    ```sh
    python -m venv env
    ```

2. **Activate the virtual environment**:
    - On MacOS and Linux:
      ```sh
      source env/bin/activate
      ```
    - On Windows:
      ```sh
      env\Scripts\activate
      ```

3. **Install the required packages**:
    ```sh