In [4]:
!ls

README.md    documents.json	minsearch.py	 parse-faq_ts.ipynb
__pycache__  elastic-search.md	parse-faq.ipynb  rag-intro.ipynb


In [2]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [3]:

import minsearch
import json
import os

In [9]:
# Load the raw FAQ export. The encoding is pinned to UTF-8 because the file
# contains non-ASCII characters (e.g. curly quotes) and the default encoding
# used by open() depends on the platform locale.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [10]:
# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the course it belongs to. Each entry is a new dict, so docs_raw
# is left untouched and the cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
documents[-1]

{'text': 'Each submitted project will be evaluated by 3 (three) randomly assigned students who have also submitted the project.\nYou will also be responsible for grading the projects from 3 fellow students yourself. Please be aware that: not complying to this rule also implies you failing to achieve the Certificate at the end of the course.\nThe final grade you get will be the median score of the grades you get from the peer reviewers.\nAnd of course, the peer review criteria for evaluating or being evaluated must follow the guidelines defined here (TBA for link).',
 'section': 'Certificates:',
 'question': 'How is my capstone project going to be evaluated?',
 'course': 'mlops-zoomcamp'}

In [11]:
# Create the minsearch index: question/text/section are registered as text
# fields and course as a keyword field. No documents are added at this point;
# that happens when index.fit(documents) is called in a later cell.
# NOTE(review): presumably text fields are ranked by textual similarity and
# keyword fields are used for exact-match filtering (see filter_dict in
# search()) - confirm against the minsearch documentation.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [12]:
index

<minsearch.Index at 0x7bf69f168790>

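Keyword fields such as `course` are used for exact-match filtering, similar to a SQL `WHERE` clause: `SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';`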

In [13]:
q = 'the course has already started, can I still enroll?'

In [14]:
index.fit(documents)

<minsearch.Index at 0x7bf69f168790>

In [15]:
from openai import OpenAI

In [16]:
client = OpenAI()

In [17]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"It depends on the specific course and institution's policies. Many schools and online platforms allow late enrollment within a certain timeframe. Here are some steps you can take:\n\n1. **Check the Enrollment Deadline:** Look up the course information to see if there's a specific cut-off date for enrollment.\n\n2. **Contact the Instructor or Administration:** Reach out to the instructor or the administration office to explain your situation and ask if late enrollment is possible.\n\n3. **Review Course Materials:** If you’re allowed to enroll late, make sure to catch up on any missed lectures, assignments, and notes to stay on track.\n\n4. **Understand the Policies:** Be aware of any penalties or challenges associated with late enrollment, such as missed grades or the need to catch up quickly.\n\nAct quickly, as the sooner you address the issue, the better your chances of being able to join the course."

In [18]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the minsearch index for FAQ entries relevant to ``query``.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            the course this notebook was originally hard-coded to.
        num_results: Maximum number of entries to return.

    Returns:
        Whatever ``index.search`` returns for the given arguments.
    """
    # Matches in the question count 3x, matches in the section title 0.5x;
    # fields not listed here keep minsearch's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [19]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes a block under ``CONTEXT:``.

    Returns:
        The formatted prompt with leading/trailing whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved entry, each terminated by a blank line.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]

    # The final strip() drops the blank line left after the last entry.
    return template.format(question=query, context="".join(entries)).strip()

In [20]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send to the chat completions endpoint.
        model: Name of the chat model to use. Defaults to ``'gpt-4o'``, the
            model this notebook was originally hard-coded to.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [21]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` by retrieving FAQ entries with ``search`` and passing
    the resulting prompt to ``llm``.

    Returns:
        The answer produced by ``llm`` for the built prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [22]:
rag(query)

'To run Kafka, you can follow these instructions based on the provided contexts:\n\n1. **Setting up Java Kafka**:\n    - Navigate to your project directory.\n    - Run the following command in the terminal:\n      ```\n      java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n      ```\n    - Replace `<jar_name>` with the appropriate jar file name for your project.\n\n2. **Running Python Kafka**:\n    - First, create a virtual environment and install the necessary packages:\n      ```sh\n      python -m venv env\n      source env/bin/activate\n      pip install -r ../requirements.txt\n      ```\n    - Note: On Windows, the activation command is different:\n      ```sh\n      .\\env\\Scripts\\activate\n      ```\n    - To deactivate the virtual environment after running your scripts:\n      ```sh\n      deactivate\n      ```\n\n3. **Addressing Permissions for Build Scripts (Linux/macOS)**:\n    - If you encounter a “Permission denied” error w

In [23]:
rag('the course has already started, can I still enroll?')

'Yes, you can still enroll in the course even after it has started. You are eligible to submit the homework assignments, but keep in mind there are deadlines for turning in the final projects. Make sure not to leave everything until the last minute.'

In [25]:
from elasticsearch import Elasticsearch


In [29]:
es_client = Elasticsearch('http://localhost:9200') 

In [30]:
# Index definition: a single shard with no replicas, free-text fields mapped
# as "text" and the course identifier mapped as "keyword".
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Elasticsearch keeps the index across kernel restarts, so a plain create()
# fails on the second run. Drop any existing index of the same name first
# (ignore_unavailable makes this a no-op on the first run) so the cell can be
# re-executed. WARNING: this discards any documents already in the index.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [31]:
from tqdm.auto import tqdm

In [32]:
# Send every FAQ entry to Elasticsearch, one index request per document;
# wrapping the list in tqdm displays a progress bar.
# NOTE(review): no explicit id is passed, so re-running this cell presumably
# stores a second copy of every document - confirm before re-executing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1025 [00:00<?, ?it/s]

In [33]:
query = 'I just disovered the course. Can I still join it?'

In [34]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index for FAQ entries relevant to ``query``.

    Args:
        query: Free-text user question.
        course: Value the ``course`` field must match exactly (term filter).
            Defaults to the course this notebook was originally hard-coded to.
        num_results: Maximum number of hits to request (``size``).

    Returns:
        A list with the ``_source`` document of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [35]:
def rag(query):
    """Answer ``query`` using Elasticsearch retrieval.

    This redefinition replaces the earlier ``rag`` that retrieved via
    ``search``; from this cell on, retrieval goes through ``elastic_search``.

    Returns:
        The answer produced by ``llm`` for the built prompt.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [36]:
rag(query)

'Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homeworks, but keep in mind that there will be deadlines for turning in the final projects. So make sure not to leave everything for the last minute.'