In [3]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [4]:
#!wget https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json

In [1]:
import minsearch
import json


In [2]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [4]:
documents[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [6]:
q = 'the course has already started, can I still enroll?'

In [7]:
index.fit(documents)


<minsearch.Index at 0x7bba1c9ab550>

In [8]:
from openai import OpenAI
#client = OpenAI(api_key="")
client =OpenAI()

In [9]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"It's possible to still enroll in a course that has already started, but it depends on the policies of the institution or organization offering the course. Here are some steps you can take to find out if you can still enroll:\n\n1. **Check the Course Website or Syllabus**: Sometimes, course websites or syllabi will have information about late enrollment policies.\n  \n2. **Contact the Instructor or Course Coordinator**: Reach out directly to the instructor or the course coordinator via email or during office hours to inquire about the possibility of joining the class late. Make sure to explain your situation and express your enthusiasm for the course.\n\n3. **Speak to Your Academic Advisor**: If you are a student at a school or university, your academic advisor can provide guidance and may be able to facilitate your late enrollment.\n\n4. **Visit the Registrar's Office**: For university courses, the registrar’s office will have the most accurate information about deadlines and late enr

In [10]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [11]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [12]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [13]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [14]:
rag(query)


'To run Kafka, you can follow the instructions based on whether you are using Java or Python.\n\nFor **Java Kafka**:\n1. Navigate to the project directory.\n2. Use the following command to run your Kafka application (producer, consumer, kstreams, etc.):\n   ```sh\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\nFor **Python Kafka**:\n1. Create a virtual environment and install the necessary packages.\n   ```sh\n   python -m venv env\n   source env/bin/activate\n   pip install -r ../requirements.txt\n   ```\n2. To use the virtual environment, run:\n   ```sh\n   source env/bin/activate\n   ```\n3. To deactivate the virtual environment when done:\n   ```sh\n   deactivate\n   ```\n\nNote: On Windows, you will use `env/Scripts/activate` instead of `env/bin/activate`.\n\nMake sure that any required Docker images are up and running before you start.'

In [15]:
rag('the course has already started, can I still enroll?')


"Yes, you can still enroll in the course after it has started. While you're eligible to submit homework even if you register late, be mindful of the deadlines for the final projects and avoid leaving everything for the last minute."

In [16]:
documents[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [17]:
from elasticsearch import Elasticsearch


In [18]:
es_client = Elasticsearch('http://localhost:9200') 


In [19]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})