In [19]:
import minsearch
import json
import os
from tqdm.auto  import tqdm
from openai import OpenAI
import ollama
from elasticsearch import Elasticsearch


In [20]:


es_client = Elasticsearch('http://localhost:9200')

es_client.info()

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama')


ollama.pull('phi3')

{'status': 'success'}

In [4]:
q = "how do I run kafka?"


index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}


search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": q,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}


prompt_template = """
You're a course teaching assistant. Answer the question based on the context from the FAQ database.
Use only the facts from the CONTEXT when answering the Question.

QUESTION: {question}

CONTEXT: {context}

""".strip()





In [5]:
def minsearch_result(query):

    with open('../../data/documents.json','rt') as f_in:
        docs_raw = json.load(f_in)

    documents = []


    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            doc['course'] = course_dict['course']
            documents.append(doc)
            

    index = minsearch.Index(
        text_fields=["question","text","section"],
        keyword_fields=['course']
        )

    index.fit(documents)



    boost = { 'question':3.0,'section':0.5}

    search_results = index.search(
            query=query,
            filter_dict={'course':'data-engineering-zoomcamp'},
            boost_dict=boost,
            num_results=5
        )
    
    return search_results



In [6]:
def elastic_search_result(query):

    index_name = "course-questions"

    try:
       es_client.indices.create(index=index_name,body=index_settings)
    except:
       pass
    



    with open('../../data/documents.json','rt') as f_in:
        docs_raw = json.load(f_in)

    documents = []


    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            doc['course'] = course_dict['course']
            documents.append(doc)

    for doc in tqdm(documents):
        es_client.index(index=index_name, document=doc)

    response = es_client.search(index=index_name,body=query)

    result_docs = []


    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [11]:
def create_context(search_results):
    context = ""


    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context

In [7]:
def  create_prompt(context,question,prompt_template):


    prompt = prompt_template.format(question=question,context=context).strip()

    return prompt

In [8]:
def get_query_result(prompt):
    response = client.chat.completions.create(model='phi3',messages=[{"role":"user","content":prompt}])
    results = response.choices[0].message.content
    return results

In [9]:
def rag(query,prompt_template,search_engine=None):
    if search_engine == "minsearch":
       search_results = minsearch_result(query)
    else:
       search_results = elastic_search_result(query)

    context = create_context(search_results)
    prompt = create_prompt(context=context,question=query,prompt_template=prompt_template)
    print(prompt)
    
    return get_query_result(prompt)

In [21]:
rag(q,prompt_template,"minsearch")

You're a course teaching assistant. Answer the question based on the context from the FAQ database.
Use only the facts from the CONTEXT when answering the Question.

QUESTION: how do I run kafka?

CONTEXT: section: Module 6: streaming with kafka
question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

section: Module 6: streaming with kafka
question: Module “kafka” not found when trying to run producer.py
answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - b

" To run Kafka in Python with a virtual environment on MacOS or Linux:\n1. Create a new directory for your project, then navigate into it using the `cd` command: \n```bash\nmkdir my_kafka_project\ncd my_kafka_project\n```\n2. Inside this directory, create and activate a Python virtual environment with these commands (run only once): \n```bash\npython -m venv env\nsource env/bin/activate\n```\n3. Install necessary packages in the active environment using pip:\n```bash\npip install kafka-python requests pika Pillow psutil six PyYAML arrow sqlalch0me future # This list can grow as your project grows, but this is a good start \n```\n4. To run producer or consumer scripts for Kafka in Python within the virtual environment activate it first and execute with `python`:  \n```bash\nsource env/bin/activate\npython my_project/producer.py # Replace 'producer.py' with your actual script name \ndeactivate\n```\nPlease note that this is a simplistic approach for running Kafka-related tasks and the FA

In [22]:
rag(search_query,prompt_template)

100%|██████████| 948/948 [00:01<00:00, 766.72it/s]


You're a course teaching assistant. Answer the question based on the context from the FAQ database.
Use only the facts from the CONTEXT when answering the Question.

QUESTION: {'size': 5, 'query': {'bool': {'must': {'multi_match': {'query': 'how do I run kafka?', 'fields': ['question^3', 'text', 'section'], 'type': 'best_fields'}}, 'filter': {'term': {'course': 'data-engineering-zoomcamp'}}}}}

CONTEXT: section: Workshop 1 - dlthub
question: How do I install the necessary dependencies to run the code?
answer: Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).

section: Workshop 1 - dlthub
question: How do I install the necessary dependencies to run the code?
answer: Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. 

" Answer: To run the code for Workshop 1 - dlthub on ZoomCamp, first make sure that 'dlt[duckdb]' and DuckDB are both installed. Execute !pip install dlt[duckdb] to get started with installing these packages in your environment (ZoomCamp). Additionally, ensure you have pip itself installed before adding the duckdb package as well since it is also a requirement for running this code successfully on ZoomCamp's platform."