In [6]:
import minsearch
import json

In [7]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)


In [8]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)


In [9]:
documents[0]

{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [10]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [11]:
q = 'the course has already started, can I still enroll?'

In [12]:
q = 'the course has already started, can I still enroll?'


In [13]:
index.fit(documents)


<minsearch.minsearch.Index at 0x115e7a240>

In [14]:
from openai import OpenAI
client = OpenAI()
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

'If a course has already started, whether you can still enroll typically depends on the specific policies of the institution or platform offering the course. Here are a few steps you can take:\n\n1. **Check with the Institution**: Contact the admissions office or the course instructor to inquire about late enrollment policies.\n\n2. **Review Enrollment Policies**: Look for information on the course’s website or in any materials provided about late enrollment or add/drop deadlines.\n\n3. **Consider Audit Options**: Some institutions may allow you to audit the course, which means you can attend classes but may not receive credit.\n\n4. **Explore Online Options**: If the course is offered online, sometimes platforms allow you to enroll late and catch up at your own pace.\n\n5. **Discuss with the Instructor**: If the course is small or has a flexible format, the instructor might be willing to accommodate a late start.\n\nKeep in mind that catching up with course material might require extr

In [15]:
import pprint
pprint.pprint(response.choices[0].message.content)

('If a course has already started, whether you can still enroll typically '
 'depends on the specific policies of the institution or platform offering the '
 'course. Here are a few steps you can take:\n'
 '\n'
 '1. **Check with the Institution**: Contact the admissions office or the '
 'course instructor to inquire about late enrollment policies.\n'
 '\n'
 '2. **Review Enrollment Policies**: Look for information on the course’s '
 'website or in any materials provided about late enrollment or add/drop '
 'deadlines.\n'
 '\n'
 '3. **Consider Audit Options**: Some institutions may allow you to audit the '
 'course, which means you can attend classes but may not receive credit.\n'
 '\n'
 '4. **Explore Online Options**: If the course is offered online, sometimes '
 'platforms allow you to enroll late and catch up at your own pace.\n'
 '\n'
 '5. **Discuss with the Instructor**: If the course is small or has a flexible '
 'format, the instructor might be willing to accommodate a late start.

In [16]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [17]:
def build_prompt(query, search_results):
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT "
        "from the FAQ database. Use only the facts from the CONTEXT when answering.\n\n"
        "QUESTION: {question}\n\nCONTEXT:\n{context}"
    )

    # Efficiently assemble the context blocks
    blocks = [
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}"
        for doc in search_results
    ]
    context = "\n\n".join(blocks)

    return template.format(question=query, context=context)


In [18]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [19]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [20]:
import minsearch

pprint.pprint(rag(query))

('To run Kafka, you need to ensure that your environment is properly set up, '
 'especially if you are encountering issues like "ModuleNotFoundError". Here '
 'are the steps:\n'
 '\n'
 '1. **Using Python**:\n'
 '   - Create a virtual environment:\n'
 '     ```bash\n'
 '     python -m venv env\n'
 '     source env/bin/activate  # For MacOS/Linux\n'
 '     # On Windows, use: env\\Scripts\\activate\n'
 '     ```\n'
 '\n'
 '   - Install the necessary dependencies:\n'
 '     ```bash\n'
 '     pip install -r ../requirements.txt\n'
 '     ```\n'
 '\n'
 '   - Ensure that Docker images are up and running if needed.\n'
 '\n'
 '2. **Running Java Kafka applications**:\n'
 '   - In your project directory, run the command specific to your Java '
 'application:\n'
 '     ```bash\n'
 '     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out '
 'src/main/java/org/example/JsonProducer.java\n'
 '     ```\n'
 '\n'
 'For Python users encountering module issues, it is suggested to install '
 '`kafka-python-

In [21]:
pprint.pprint(rag('the course has already started, can I still enroll?'))


('Yes, you can still enroll in the course after it has started. You are '
 'eligible to submit the homework, but be mindful of the deadlines for turning '
 "in homework and final projects. It's important not to leave everything for "
 'the last minute.')


In [22]:
documents[0]


{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [27]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    "http://localhost:9200",
    # forces 8-compatible headers
)

In [26]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    
}
}
index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/5j22wA5zQsyglii6u4JNGg] already exists')

In [28]:
documents[0]


{'text': "Data Engineering Zoomcamp FAQ\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When does the course start?',
 'course': 'data-engineering-zoomcamp'}

In [29]:
from tqdm.auto import tqdm

In [30]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1122 [00:00<?, ?it/s]

In [31]:
query = 'I just disovered the course. Can I still join it?'


In [None]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [34]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [36]:
pprint.pprint(rag(query))

  response = es_client.search(index=index_name, body=search_query)


('Yes, you can still join the course even if you discovered it after the start '
 'date. You are eligible to submit homework without registering. However, make '
 'sure to meet the deadlines for turning in homework and final projects.')
