In [None]:
import minsearch
import json

In [None]:
# Load the raw FAQ dump. The encoding is given explicitly because the answers
# contain non-ASCII punctuation (e.g. curly quotes), which would be mis-decoded
# on platforms whose default text encoding is not UTF-8.
with open("documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

In [None]:
# Flatten the per-course structure into one flat list of FAQ entries, tagging
# each entry with the course it belongs to. Every entry is copied instead of
# being modified, so the dicts inside docs_raw stay exactly as they were loaded.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
# Create the minsearch index object. "question", "text" and "section" are
# registered as text fields and "course" as a keyword field; search() further
# down boosts on the former and filters on the latter. The index is populated
# later by index.fit(documents).
index = minsearch.Index(
    text_fields=["question", "text", "section"], keyword_fields=["course"]
)

Filtering on the `course` keyword field is an exact-match condition, similar to this SQL: `SELECT * WHERE course = 'data-engineering-zoomcamp';`


In [None]:
q = "the course has already started, can I still enroll?"

In [8]:
index.fit(documents)

<minsearch.minsearch.Index at 0x1237434ca00>

In [9]:
from openai import OpenAI

In [None]:
import os
from dotenv import load_dotenv

# load_dotenv() presumably pulls variables from a local .env file into the
# process environment (confirm against python-dotenv); the OpenAI key is then
# looked up from the environment.
# NOTE(review): api_key is not referenced by any later cell shown here --
# OpenAI() is constructed without arguments.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

In [13]:
client = OpenAI()

In [None]:
# Baseline without retrieval: the raw question `q` is sent to the model as the
# only message, with no FAQ context attached.
response = client.chat.completions.create(
    model="gpt-4o", messages=[{"role": "user", "content": q}]
)

# Last expression of the cell: display the text of the first returned choice.
response.choices[0].message.content

"Whether you can still enroll in a course after it has started depends on the institution and the specific course's policies. Here are some general steps you can take to find out:\n\n1. **Check the Course Website:** Often, course websites or platforms will have information on enrollment deadlines or policies about late enrollment.\n\n2. **Contact the Instructor or Department:** Reach out directly to the course instructor or the department offering the course. They may have the authority to allow late enrollment or provide you with guidance.\n\n3. **Visit the Registrar’s Office:** If you are part of a larger institution, the registrar's office might be able to provide detailed information regarding enrollment policies and procedures.\n\n4. **Review Enrollment Policies:** Some institutions have specific policies regarding late enrollment, including potential fees or penalties. Be sure to review these before proceeding.\n\n5. **Consider Online Courses:** If it’s not possible to enroll in-

In [None]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries for `query` in the module-level minsearch `index`.

    Args:
        query: Free-text question to search for.
        course: Value the "course" keyword field must match. Defaults to
            "data-engineering-zoomcamp", which used to be hard-coded.
        num_results: Maximum number of entries to request. Defaults to 5,
            which used to be hard-coded.

    Returns:
        Whatever index.search() returns for these arguments; build_prompt()
        iterates over it and reads "section", "question" and "text" from
        each item.
    """
    # Per-field weights handed to the index: "question" is boosted (3.0) and
    # "section" is down-weighted (0.5); "text" is not listed here.
    boost = {"question": 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Each item of `search_results` must provide the keys "section", "question"
    and "text"; they are written out as one block per entry, separated by a
    blank line, underneath the instructions and the question. Surrounding
    whitespace of the finished prompt is stripped before it is returned.
    """
    # The template is assembled from explicit pieces so the single space after
    # "CONTEXT:" is visible and cannot be lost to trailing-whitespace cleanup.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    rendered = prompt_template.format(question=query, context="".join(entries))
    return rendered.strip()

In [None]:
def llm(prompt):
    """Send `prompt` to the "gpt-4o" chat model and return the reply text.

    The prompt is submitted through the module-level `client` as a single
    user message; the content of the first returned choice is the result.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o", messages=[user_message]
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
query = "how do I run kafka?"


def rag(query):
    """Answer `query` by retrieval-augmented generation.

    Retrieves FAQ entries with search(), renders them into a prompt with
    build_prompt(), and returns whatever llm() produces for that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [19]:
rag(query)

'To run Kafka as a Java application, navigate to your project directory and execute the following command in the terminal:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nMake sure to replace `<jar_name>` with the actual name of the jar file you built.'

In [None]:
rag("the course has already started, can I still enroll?")

"Yes, you can still enroll in the course even after it has started. You will be eligible to submit the homework, but keep in mind that there will be deadlines for turning in the final projects, so it's important not to leave everything until the last minute."

In [21]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [22]:
from elasticsearch import Elasticsearch

In [None]:
es_client = Elasticsearch("http://localhost:9200")

In [24]:
# Index definition: a single shard with no replicas; the three free-text
# fields are mapped as "text", and "course" as "keyword" so it can be used in
# exact-match term filters.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

# Drop any index left over from a previous run before creating it again.
# Without this, re-executing the notebook fails here because the index already
# exists, and the indexing loop (which passes no document ids) would add every
# FAQ entry a second time. NOTE: this deletes all data in `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [25]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
from tqdm.auto import tqdm

In [27]:
# Send every FAQ entry to Elasticsearch, one index() call per document; tqdm
# wraps the iteration to show progress. No document id is passed to index().
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [28]:
query = "I just disovered the course. Can I still join it?"

In [29]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ entries for `query` from the Elasticsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the "course" field must equal (term filter). Defaults
            to "data-engineering-zoomcamp", which used to be hard-coded.
        num_results: Value sent as the request's "size". Defaults to 5,
            which used to be hard-coded.

    Returns:
        A list with the "_source" of every hit in the response, in the order
        the hits were returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Text match over the three free-text fields; "question"
                # carries a ^3 boost, mirroring the 3.0 weight in search().
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # Restrict results to one course via an exact term filter.
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [30]:
def rag(query):
    """Answer `query` using Elasticsearch for the retrieval step.

    This definition replaces the rag() defined earlier in the notebook: the
    entries now come from elastic_search(), while prompt construction
    (build_prompt) and generation (llm) are unchanged.
    """
    hits = elastic_search(query)
    llm_input = build_prompt(query, hits)
    return llm(llm_input)

In [31]:
rag(query)

"Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homework assignments without registering. However, keep in mind that there are deadlines for submitting the final projects, so it's advisable not to leave everything until the last minute."

In [None]:
rag("What is this course about?")

'The context does not provide specific information regarding the content or subject matter of the course itself. Therefore, based on the given context, I am unable to answer what this course is about.'

In [None]:
rag("How do I get started?")

'To get started with the course, you need to create a GitHub account and clone the course repository to your local machine. This will allow you to access the course materials and make any necessary changes or additions. You can follow the video tutorial "Git for Everybody: How to Clone a Repository from GitHub" for guidance on cloning a repository. Also, if you plan to create your own repositories, a tutorial on setting up a repository is available at [Atlassian\'s guide](https://www.atlassian.com/git/tutorials/setting-up-a-repository). Remember to use a `.gitignore` file to exclude large database files, `.csv`, `.gz`, and sensitive information like passwords or keys, even if your repository is private.'

In [None]:
rag("How do I get a certificate?")

'To obtain a certificate for the course, you must complete it with a “live” cohort. Certificates are not awarded for completing the course in a self-paced mode because you need to peer-review capstone projects after submitting your own project, which can only be done while the course is actively running.'

In [None]:
rag("What projects will I work on?")

'The context provided does not include information about the specific projects you will work on in the course. Please refer to the course syllabus or contact the course instructor for details on the projects involved.'

In [None]:
rag("How do I make my own project?")

'To make your own project, you can use any tool you want. This flexibility allows you to design and implement your project in a way that best suits your preferences and needs.'