In [2]:
import minsearch
import json
from openai import OpenAI
from tqdm.auto  import tqdm


from elasticsearch import Elasticsearch


In [3]:
client = OpenAI(api_key=api_key)

es_client = Elasticsearch('http://localhost:9200')

es_client.info()


ObjectApiResponse({'name': 'e359b3f82038', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'LuNImNV6RB2lHk25Z9NgRA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [14]:
q = "how do I run kafka?"


index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}


search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": q,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}


prompt_template = """
You're a course teaching assistant. Answer the question based on the context from the FAQ database.
Use only the facts from the CONTEXT when answering the Question.

QUESTION: {question}

CONTEXT: {context}

""".strip()





In [5]:
def minsearch_result(query):

    with open('documents.json','rt') as f_in:
        docs_raw = json.load(f_in)

    documents = []


    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            doc['course'] = course_dict['course']
            documents.append(doc)
            

    index = minsearch.Index(
        text_fields=["question","text","section"],
        keyword_fields=['course']
        )

    index.fit(documents)



    boost = { 'question':3.0,'section':0.5}

    search_results = index.search(
            query=query,
            filter_dict={'course':'data-engineering-zoomcamp'},
            boost_dict=boost,
            num_results=5
        )
    
    return search_results



In [6]:
def elastic_search_result(query):

    index_name = "course-questions"

    try:
       es_client.indices.create(index=index_name,body=index_settings)
    except:
       pass
    



    with open('documents.json','rt') as f_in:
        docs_raw = json.load(f_in)

    documents = []


    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            doc['course'] = course_dict['course']
            documents.append(doc)

    for doc in tqdm(documents):
        es_client.index(index=index_name, document=doc)

    response = es_client.search(index=index_name,body=query)

    result_docs = []


    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [7]:
def create_context(search_results):
    context = ""


    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return context

In [8]:
def  create_prompt(context,question,prompt_template):


    prompt = prompt_template.format(question=question,context=context).strip()

    return prompt

In [9]:
def get_query_result(prompt):
    response = client.chat.completions.create(model='gpt-4o',messages=[{"role":"user","content":prompt}])
    results = response.choices[0].message.content
    return results

In [10]:
def rag(query,prompt_template,search_engine=None):
    if search_engine == "minsearch":
       search_results = minsearch_result(query)
    else:
       search_results = elastic_search_result(query)

    context = create_context(search_results)
    prompt = create_prompt(context=context,question=query,prompt_template=prompt_template)
    print(prompt)
    
    return get_query_result(prompt)

In [11]:
rag(q,prompt_template,"minsearch")

You're a course teaching assistant. Answer the question based on the context from the FAQ database.
Use only the facts from the CONTEXT when answering the Question.

QUESTION:  how do I run kafka?

CONTEXT: section: Module 6: streaming with kafka
question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

section: Module 6: streaming with kafka
question: Module “kafka” not found when trying to run producer.py
answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - 

'To run Kafka, follow these instructions depending on whether you want to run a Java or Python Kafka application:\n\n### For Java Kafka:\nIn the project directory, you can run the following command:\n```sh\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nReplace `<jar_name>` with the actual name of your JAR file.\n\n### For Python Kafka:\n1. **Create and activate a virtual environment** (only needed once):\n   ```sh\n   python -m venv env\n   source env/bin/activate  # For MacOS/Linux\n   env/Scripts/activate  # For Windows\n   ```\n\n2. **Install the necessary packages** using `requirements.txt`:\n   ```sh\n   pip install -r ../requirements.txt\n   ```\n\n3. **Activate the virtual environment** whenever needed:\n   ```sh\n   source env/bin/activate  # For MacOS/Linux\n   env/Scripts/activate  # For Windows\n   ```\n\n4. **Deactivate the virtual environment** after you are done:\n   ```sh\n   deactivate\n   ```\n\nEnsure all Docker 

In [15]:
rag(search_query,prompt_template)

100%|██████████| 948/948 [00:01<00:00, 658.03it/s]


You're a course teaching assistant. Answer the question based on the context from the FAQ database.
Use only the facts from the CONTEXT when answering the Question.

QUESTION: {'size': 5, 'query': {'bool': {'must': {'multi_match': {'query': 'how do I run kafka?', 'fields': ['question^3', 'text', 'section'], 'type': 'best_fields'}}, 'filter': {'term': {'course': 'data-engineering-zoomcamp'}}}}}

CONTEXT: section: Workshop 1 - dlthub
question: How do I install the necessary dependencies to run the code?
answer: Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).

section: Workshop 1 - dlthub
question: How do I install the necessary dependencies to run the code?
answer: Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. 

'In the context of the Data Engineering Zoomcamp, here is how you can run Kafka:\n\nFor Java-based Kafka, to run producer/consumer/kstreams/etc in the terminal, navigate to your project directory and execute the following command:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nMake sure to replace `<jar_name>` with the actual name of your jar file.'