In [2]:
!pip install minsearch



In [3]:
import minsearch

In [4]:
import json

In [5]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [6]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [7]:
documents[0]

{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [9]:
q = 'the course has already started, can I still enroll?'

In [10]:
pip install google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [11]:
index.fit(documents)

<minsearch.minsearch.Index at 0x28e6c3819a0>

In [12]:
import google.generativeai as genai
import os

In [13]:
genai.configure(api_key="AIzaSyDB3TmTeOs_UMZMfSkCo_ShzrdBi34guoY") #--- Input your Gemini API Key

In [14]:
model = genai.GenerativeModel('gemini-2.0-flash-lite')

In [15]:
response = model.generate_content([
     {"role": "user", "parts": [q]}
])

In [16]:
print(response.text)

Whether or not you can still enroll in a course that has already started depends on a few factors:

*   **The Institution:**
    *   **University/College Policies:** Most institutions have specific policies regarding late enrollment. Some might have a strict cutoff date, while others might allow late registration with penalties or instructor permission.
    *   **Course Type:** Online courses often have more flexible start dates than in-person courses. Self-paced courses may even allow enrollment at any time.
    *   **Specific Course:** The instructor may have a say in allowing late enrollment, especially if the course is small or requires a lot of interaction.

*   **Your Circumstances:**
    *   **Reason for Late Enrollment:** Were there extenuating circumstances (e.g., illness, family emergency)? Providing documentation may help your case.
    *   **Your Grade Level:** If you're a continuing student with an established record, late enrollment might be easier than for a new student.

In [17]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [18]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    
    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [19]:
def llm(prompt):
    response = model.generate_content(
        {"role": "user", "parts": [prompt]}
    )
    return response.text

In [21]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [22]:
rag(query)

"To run Kafka, you can use these steps:\n\n1.  **Python Kafka:**\n    *   Create a virtual environment and run `requirements.txt` and the python files in that environment.\n        *   To create a virtual environment and install packages (run only once):\n            *   `python -m venv env`\n            *   `source env/bin/activate`\n            *   `pip install -r ../requirements.txt`\n        *   To activate it (you'll need to run it every time you need the virtual env):\n            *   `source env/bin/activate`\n        *   To deactivate it:\n            *   `deactivate`\n        *   (For Windows, the path is `env/Scripts/activate`).\n    *   The virtual environment should be created only to run the python file. Docker images should first all be up and running.\n2.  **Java Kafka:**\n    *   In the project directory, run:\n        *   `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`"

In [23]:
rag('the course has already started, can I still enroll?')

'Yes, you can still join the course after it has started.\n'

In [24]:
documents[0]

{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [25]:
from elasticsearch import Elasticsearch

In [2]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

NameError: name 'es_client' is not defined