In [25]:

# One-time setup: uncomment the next line to download minsearch.py next to this notebook.
# !wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [2]:
import minsearch

In [3]:
import json

In [4]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the FAQ text
# contains non-ASCII characters (e.g. curly quotes) and the platform default
# encoding used by open() is not UTF-8 everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:

# Flatten the per-course structure into a single list of FAQ entries, tagging
# each entry with the course it came from. A new dict is built per entry so
# docs_raw itself is left unmodified by this cell.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# 'question', 'text' and 'section' are registered as text fields and 'course'
# as a keyword field (presumably: free-text matching vs. exact-match filtering
# -- confirm against minsearch.py).
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.Index at 0x751e1c2d7400>

In [26]:
# Unfiltered search across all courses: 'question' is boosted by 3.0 and
# 'section' by 0.5; at most five results are requested.
boost = {'question': 3.0, 'section': 0.5}
results = index.search(query=q, boost_dict=boost, num_results=5)

In [27]:
results

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the cour

In [29]:
# Same search as before, now restricted to one course through filter_dict.
boost = {'question': 3.0, 'section': 0.5}
course_filter = {'course': 'data-engineering-zoomcamp'}
results = index.search(
    query=q,
    filter_dict=course_filter,
    boost_dict=boost,
    num_results=5,
)

In [30]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [13]:
from groq import Groq
import os

In [17]:
client = Groq(api_key = os.environ['GROQ_API_KEY'])

In [18]:
# Baseline without retrieval: the bare question is sent as the only message,
# so the model has no FAQ context to draw on.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": q}],
    model='llama3-70b-8192',
)

response.choices[0].message.content

"I'm happy to help you with your question!\n\nTypically, the answer to this question depends on the course policies and the institution offering the course. Here are a few possible scenarios:\n\n1. **Some courses may allow late enrollment**: Depending on the course and institution, you might still be able to enroll, even if the course has already started. It's best to check with the course instructor or the institution's registration office to see if late enrollment is possible.\n2. **There might be a deadline for late enrollment**: In some cases, there may be a deadline for late enrollment, beyond which no new students can join the course. Be sure to ask about any deadlines when you inquire about enrolling.\n3. **You might need permission from the instructor**: In some cases, you may need to get permission from the course instructor to enroll late. They may consider your request if you have a valid reason for missing the initial enrollment period.\n4. **Online courses might have more 

In [19]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries matching `query` in the minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the 'course' field must match. Defaults to
            'data-engineering-zoomcamp', the value previously hard-coded.
        num_results: Maximum number of entries requested (default 5).

    Returns:
        The result of `index.search` -- in this notebook, a list of FAQ dicts
        with 'text', 'section', 'question' and 'course' keys.
    """
    # 'question' is boosted by 3.0 and 'section' by 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [33]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for `query` from retrieved FAQ entries.

    Args:
        query: The user's question; inserted after "QUESTION:".
        search_results: Iterable of dicts, each providing 'section',
            'question' and 'text' keys. Entries are rendered in order under
            the "CONTEXT:" heading.

    Returns:
        The prompt string with leading/trailing whitespace stripped.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is visible.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    # The final strip drops the blank lines left after the last entry.
    return prompt_template.format(question=query, context=context).strip()

In [32]:
# `query` is otherwise first assigned in a later cell, so on a fresh
# top-to-bottom run it would be undefined here. Set it explicitly to the
# question this cell's recorded output was built from.
query = 'how do I run kafka?'
test_prompt = build_prompt(query, search(query))

section: Module 6: streaming with kafka
question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

section: Module 6: streaming with kafka
question: Module “kafka” not found when trying to run producer.py
answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate)
Also the virtual environment should be created only to run the python file. Docker images should first all be up and running.

secti

In [34]:
test_prompt

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: how do I run kafka?\n\nCONTEXT: \nsection: Module 6: streaming with kafka\nquestion: Java Kafka: How to run producer/consumer/kstreams/etc in terminal\nanswer: In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n\nsection: Module 6: streaming with kafka\nquestion: Module “kafka” not found when trying to run producer.py\nanswer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you\'ll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on Ma

In [39]:
def llm(prompt, model='gemma-7b-it'):
    """Send `prompt` as a single user message via `client` and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Model identifier passed to the chat completions call. Defaults
            to 'gemma-7b-it'; pass e.g. 'llama3-70b-8192' or 'llama3-8b-8192'
            to compare models without editing this function.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [23]:
query = 'how do I run kafka?'

def rag(query, search_fn=None):
    """Answer `query` by retrieving FAQ entries and passing them to the LLM.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            entries to use as context. When omitted, the notebook's `search`
            function is used (resolved at call time), as before.

    Returns:
        The answer text returned by `llm`.
    """
    if search_fn is None:
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [24]:
rag(query)

'Based on the provided context, to run Kafka, you need to follow these steps:\n\n1. Make sure you are in the project directory.\n2. Run the following command:\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nReplace `<jar_name>` with the actual name of your JAR file.\n\nNote: This is based on the answer provided in the context for the question "Java Kafka: How to run producer/consumer/kstreams/etc in terminal" in the "Module 6: streaming with kafka" section.'

Model used for the next answer: `llama3-70b-8192`

In [35]:
rag('the course has already started, can I still enroll?')

'According to the context, the answer is: Yes, you can still enroll in the course even after it has started.'

Model used for the next answer: `llama3-8b-8192`

In [37]:
rag('the course has already started, can I still enroll?')

'Based on the FAQ database, the answer to the QUESTION "Can I still enroll now that the course has already started?" is YES.'

Model used for the next answer: `gemma-7b-it`

In [41]:
rag('the course has already started, can I still enroll?')

'**Yes, you can still enroll in the course even after it has started.** You are still eligible to submit the homeworks, but be aware of the deadlines for completing the final project.'

In [42]:

documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [43]:

from elasticsearch import Elasticsearch

In [55]:

es_client = Elasticsearch('http://localhost:9200') 
# es_client = Elasticsearch('https://potential-space-disco-r47vgrjr9gg7cxxj7-9200.app.github.dev/') 

In [None]:
# Single-shard, no-replica index. The three free-text fields are mapped as
# "text"; "course" is mapped as "keyword" and is used in a term filter later.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists is rejected by Elasticsearch, so only
# create it when it is missing; this keeps the cell safe to re-run.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

In [57]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [63]:
from tqdm.auto import tqdm

In [64]:
# Each document gets its list position as an explicit id. Without an id,
# Elasticsearch assigns a fresh one on every call, so re-running this cell
# would store every FAQ entry a second time; with a fixed id a re-run
# overwrites the existing entry instead.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

100%|██████████████████████| 948/948 [00:19<00:00, 47.95it/s]


In [65]:
query = 'I just disovered the course. Can I still join it?'

In [66]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries matching `query` in the Elasticsearch index.

    Args:
        query: Free-text question to search for.
        course: Value used in the term filter on the 'course' field. Defaults
            to "data-engineering-zoomcamp", the value previously hard-coded.
        num_results: Value sent as the request's "size" (default 5).

    Returns:
        A list with the `_source` of every hit, in the order returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [67]:
def rag(query, search_fn=None):
    """Answer `query` by retrieving FAQ entries and passing them to the LLM.

    NOTE: this rebinds the name `rag` that an earlier cell defined on top of
    the minsearch-based `search`; from this cell on, `rag` retrieves through
    Elasticsearch unless `search_fn` says otherwise.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            entries to use as context. When omitted, `elastic_search` is used
            (resolved at call time), as before.

    Returns:
        The answer text returned by `llm`.
    """
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [68]:
rag(query)

"Based on the provided context, it appears that you can join the course even after the start date. The course materials will be available after the course finishes, allowing you to follow the course at your own pace. Additionally, you can still submit the homeworks even if you haven't registered for the course."

In [69]:
query = 'How can I run Kafka'

In [70]:
rag(query)

'**How to run Kafka:**\n\nThe provided context does not contain information regarding how to **run Kafka itself**. The referenced answers deal with:\n\n- **Confluent Kafka:** Finding the schema registry URL.\n- **Java Kafka:** Running producer, consumer, and kstreams applications from the command line.\n\nTherefore, the requested information is not available in the given context.'