In [5]:
import minsearch
import openai
import json

In [2]:
with open("documents.json", "rb") as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for course in docs_raw:
    for doc in course["documents"]:
        doc['course'] = course["course"]
        documents.append(doc)

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
index = minsearch.Index(
    text_fields = ["question", "text", "section"],
    keyword_fields = ["course"]
)

In [6]:
q = "can i still enroll in the course after it has started"

In [7]:
index.fit(documents)

<minsearch.Index at 0x725ed15005e0>

In [8]:
from openai import OpenAI

In [9]:
client = OpenAI()

In [10]:
response = client.chat.completions.create(
    model = "gpt-4o",
    messages=[{"role": "user", "content": q}]
)
response.choices[0].message.content

"Whether you can enroll in a course after it has started depends on several factors, including the institution or platform offering the course, the course policies, and how much of the course has already been completed. Here are some general guidelines:\n\n1. **Institution/Platform Policies**: Check the specific policies of the institution or online platform offering the course. Some universities and platforms like Coursera or Udemy may allow late enrollment within a certain time frame after the course starts.\n\n2. **Course Structure**: Consider the structure of the course. Self-paced courses often offer more flexibility for late enrollment compared to instructor-led courses with scheduled sessions and assignments.\n\n3. **Instructor's Discretion**: Sometimes, the course instructor may have the discretion to allow late enrollment. It can be helpful to reach out directly to the instructor or the course administration to explain your situation.\n\n4. **Catch-Up Feasibility**: Assess whe

In [11]:
def search(query):
    boost = {"question": 3.0, "section": 0.5}

    results = index.search(
        filter_dict={"course" : "data-engineering-zoomcamp"},
        query = query,
        boost_dict = boost,
        num_results = 5
    )
    return results

In [12]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()
    
    context = ""
        
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [13]:
def llm(prompt):
    response = client.chat.completions.create(
    model = "gpt-4o",
    messages=[{"role": "user", "content": prompt}]
)
    return response.choices[0].message.content

In [14]:
query = "how do i build kafka"

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [15]:
rag("how do i get the certificate")

'To obtain a certificate for the course, you need to complete the course with a “live” cohort. Certificates are not awarded for those who complete the course in a self-paced mode. This is because peer-reviewing of capstone projects is required, and you can only peer-review projects when the course is running live.'

In [16]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [17]:
from elasticsearch import Elasticsearch

In [18]:
es_client = Elasticsearch("http://localhost:9200")

In [19]:
es_client.info()

ObjectApiResponse({'name': '79c473686405', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'wx8PgtqcRw-7YZnbPgOzoQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [20]:
query = "I just disovered the course. Can I still join it?"

In [21]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}


index_name = "course-questions"
es_client.indices.create(index= index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [22]:
from tqdm.auto import tqdm

In [23]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [24]:
query = "I just disovered the course. Can I still join it?"

In [25]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in response["hits"]["hits"]:
        result_docs.append(hit['_source'])
    return result_docs

In [26]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [27]:
rag(query)

"Yes, you can still join the course even after the start date. You are eligible to submit the homeworks regardless of registration. However, make sure to adhere to the deadlines for the final projects. Don't leave everything for the last minute."