In [4]:
import json

### Gathering context

In [6]:
# Read the raw FAQ export as UTF-8 explicitly; without `encoding=` the platform's
# locale codec is used, which can mis-decode or fail on non-ASCII text.
with open("../01-intro/documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course groups into one list, tagging each entry with the
# course it belongs to. Entries are copied, so `docs_raw` stays exactly as loaded.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

In [142]:
# Sample learner question. Not referenced by any later cell visible in this
# notebook excerpt (those cells assign `query` instead).
q = "the course has already started, can I still enroll?"

### Ollama (phi3)

In [15]:
from openai import OpenAI

# OpenAI-style client pointed at a server on localhost rather than a hosted API.
# NOTE(review): port 11434 and the "ollama" key presumably target a local Ollama
# server that does not validate the key — confirm against the deployment.
client = OpenAI(
    base_url="http://localhost:11434/v1/",
    api_key="ollama",
)

### Elasticsearch

In [7]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node on localhost:9200 (plain HTTP, no credentials passed).
es_client = Elasticsearch("http://localhost:9200")

In [8]:
# Connectivity check: the cell output shows the cluster name and version metadata.
es_client.info()

ObjectApiResponse({'name': '7cfb7484ed0b', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'iNrf-i6FQ3CpdzKpXNqwzA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
# Index layout for the FAQ entries: a single primary shard and no replicas.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Free-text fields, queried with `multi_match` by the search helper.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Kept as an exact value so it can be used in a `term` filter.
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists raises an error, and re-indexing into a
# surviving index would store every document a second time.
# `ignore_unavailable=True` makes the delete a no-op when the index is absent.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
from tqdm.auto import tqdm

# Index every FAQ entry with its own request; tqdm renders the progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1015 [00:00<?, ?it/s]

In [12]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Return the stored FAQ entries that best match ``query``.

    The query text is matched against the ``question``, ``text`` and
    ``section`` fields (``question`` boosted by 3, ``best_fields`` scoring)
    and the candidates are restricted to a single course.

    Args:
        query: Free-text question to search for.
        course: Exact value of the ``course`` field to filter on. Defaults to
            ``"data-engineering-zoomcamp"``, the value that used to be
            hard-coded, so existing one-argument calls behave the same.
        size: Maximum number of hits to request. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # The filter narrows the candidate set to one course.
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [13]:
# Prompt skeleton filled in with `str.format`: `{question}` receives the user's
# query and `{context}` receives the formatted FAQ entries.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
"""


def build_prompt(query, search_results):
    """Render the final LLM prompt for ``query`` from the retrieved entries.

    Every entry in ``search_results`` is read through its ``section``,
    ``question`` and ``text`` keys and rendered as a three-line paragraph
    followed by a blank line. The paragraphs are concatenated in order and
    substituted, together with ``query``, into ``prompt_template``.

    Returns:
        The filled-in template with leading and trailing whitespace removed.
    """
    entry_blocks = (
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    )
    filled = prompt_template.format(question=query, context="".join(entry_blocks))
    return filled.strip()

In [14]:
def llm(prompt, model="phi3"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message of the conversation.
        model: Name of the model to query. Defaults to ``"phi3"``, the value
            that used to be hard-coded, so existing one-argument calls behave
            the same.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [19]:
def rag(query):
    """Answer ``query`` by retrieving FAQ entries and prompting the model with them.

    Runs ``elastic_search``, feeds its result to ``build_prompt`` and passes
    the resulting prompt to ``llm``.

    Returns:
        A ``(answer, search_results)`` tuple: the value returned by ``llm``
        and the entries returned by ``elastic_search``.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved)), retrieved

In [20]:
# End-to-end run of the pipeline; keeps both the reply and the retrieved entries.
query = "how do I run kafka?"
answer, search_results = rag(query)

In [21]:
# print() renders the line breaks in the reply; a bare `answer` expression
# would display the string's repr with escaped "\n" instead.
print(answer)

 To run Kafka in a terminal for Java projects like JsonProducer, follow these steps based on Section 6's context:

1. Open your project directory using Git and clone it if necessary by watching an online tutorial about cloning from GitHub as suggested in the FAQ database under "General course-related questions".

2. After ensuring that DuckDB is installed locally, create a virtual environment specific for Kafka operations:
```
python -m venv env  # This should be executed only once to set up your Python environment and install required packages without affecting other projects' dependencies on the system-wide Python installation. Use 'source env/bin/activate'. On Windows, replace it with '.env\Scripts\activate', where `.env` is a directory containing an `environment.yml`.
pip install -r ../requirements.txt  # Assuming your requirements file includes DuckDB and any Kafka-related packages listed afterward (e.g., kafkastream). It's vital to include both the Java code path in 'src/main/' a

In [23]:
# NOTE(review): "disovered" is a typo for "discovered". It is left untouched here
# because the string is passed verbatim to the search and to the model.
query = "I just disovered the course. Can I still join it?"
answer, search_results = rag(query)

In [24]:
# print() renders the line breaks in the reply; a bare `answer` expression
# would display the string's repr with escaped "\n" instead.
print(answer)

 Yes, you can still join the course even if you discover it later or have already joined but are late in doing so. As long as there's time left for submitting homeworks and no final projects need immediate attention (since deadlines might apply), joining is possible. Just remember to submit your work on time!

It seems like this FAQ database encourages flexibility regarding when one can join the course, provided they manage their timelines effectively with respect to assignment submission and understanding that a confirmation email or registration list isn't mandatory for participation as acceptance is assumed once you register. The slack channel remains open for support even in self-paced mode, but it’s advised first checking the FAQ document which might answer your questions already covered therein.

As always with courses that allow late registrations or are designed to be flexible and student-driven (it's not clear if this course is a traditional setting based on context), each sit