In [76]:
import os
import json
from tqdm.auto import tqdm
from openai import OpenAI
import tiktoken
from elasticsearch import Elasticsearch

import minsearch

In [3]:
# Load the raw FAQ dump. The encoding is stated explicitly because the data
# contains non-ASCII characters (e.g. curly quotes) and the platform default
# text encoding is not guaranteed to be UTF-8.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into a single list of FAQ records, tagging
# each record with the course it belongs to. Each record is a shallow copy,
# so the dicts inside docs_raw are left exactly as loaded and re-running this
# cell never depends on (or adds to) earlier in-place changes.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [5]:
# Peek at the first flattened record to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Create the minsearch index: "question", "text" and "section" are registered
# as text fields, "course" as a keyword field (search() filters on it).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [8]:
# Fit the index on the flattened FAQ records.
index.fit(documents)

<minsearch.Index at 0x12122ba00>

In [15]:
# OpenAI client used by every LLM call below. No API key is passed here, so it
# is presumably picked up from the environment (e.g. OPENAI_API_KEY) — verify
# for your setup.
client = OpenAI()

In [16]:
# Baseline: put the question to the model directly, without any FAQ context.
q = 'the course has already started, can I still enroll?'

response = client.chat.completions.create(
    model='gpt-4o',
    messages=[dict(role="user", content=q)],
)

# Display only the text of the first completion choice.
response.choices[0].message.content

"The ability to enroll in a course that has already started depends on the policies of the specific institution or organization offering the course. Here are a few steps you can take to find out:\n\n1. **Check the Course Registration Page**: Look for any information regarding late enrollment or deadlines. Sometimes there may be a grace period for late registrants.\n\n2. **Contact the Instructor**: Reach out to the course instructor or coordinator. Explain your situation and ask if it's possible to join the class late. Some instructors may be flexible and allow you to catch up on missed material.\n\n3. **Consult Academic Advising**: Some institutions have academic advisors or registrars who can provide guidance on enrollment policies and help you navigate late registration.\n\n4. **Review Institution Policies**: Some schools have specific policies in place for late enrollments, add/drop periods, or special permissions. Familiarize yourself with these policies.\n\n5. **Consider Course Lo

In [17]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ records for ``query`` in the minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the "course" keyword field must equal. Defaults to the
            course that was previously hard-coded.
        num_results: Number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns; build_prompt() iterates over it and
        reads the 'question' and 'text' keys of each item.
    """
    # Boost values passed to the index: "question" gets 3.0, "section" 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [81]:
def build_prompt(query, search_results):
    """Fill the teaching-assistant prompt with a question and retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of mappings that each provide 'question' and
            'text'; every one is rendered as a "Q: ... / A: ..." entry.

    Returns:
        The formatted prompt with leading and trailing whitespace stripped.
    """
    template = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION.

        QUESTION: {question}

        CONTEXT:
        {context}
    """.strip()

    # Render one Q/A entry per retrieved document, then glue them together in
    # retrieval order.
    entries = [
        f"""
        Q: {doc['question']}
        A: {doc['text']}\n\n
    """
        for doc in search_results
    ]
    context = "".join(entries)

    return template.format(question=query, context=context).strip()

In [36]:
def llm(prompt):
    """Send ``prompt`` to gpt-4o as a single user message and return the reply text.

    Args:
        prompt: Full prompt string, e.g. the output of build_prompt().

    Returns:
        The message content of the first completion choice.
    """
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    # Hand back only the answer text: rag() and rag_elastic() return this value
    # as the answer, and the recorded rag(query) output is a plain string, not
    # the raw API response object.
    return response.choices[0].message.content

In [20]:
# Sample question used to try out the minsearch-backed pipeline below.
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` from FAQ entries retrieved through the minsearch index.

    Pipeline: search() retrieves the entries, build_prompt() turns them into a
    prompt, and llm() produces the value that is returned.
    """
    return llm(build_prompt(query, search(query)))

In [21]:
# Run the minsearch-backed RAG pipeline on the sample question.
rag(query)

'To run Kafka, follow the appropriate steps provided in the CONTEXT based on your specific setup (Java or Python):\n\n### For Java:\n1. Navigate to your project directory.\n2. Execute the following command in the terminal:\n    ```sh\n    java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n    ```\n\n### For Python:\n1. **Set up a Virtual Environment (if not already done):**\n    ```sh\n    python -m venv env\n    source env/bin/activate\n    pip install -r ../requirements.txt\n    ```\n   Note: For Windows, the command to activate the virtual environment is `env\\Scripts\\activate`.\n\n2. **Activate the Virtual Environment:**\n    ```sh\n    source env/bin/activate\n    ```\n\n3. If you get a `Permission denied` error when running `./build.sh`, you may need to change its permissions:\n    ```sh\n    chmod +x build.sh\n    ```\n\n4. **Run the Python Kafka producer/consumer script:**\n    - Ensure that Docker images are up and running.\n    -

In [23]:
# Client for the Elasticsearch node at http://localhost:9200; the cells below
# assume that node is up and reachable.
es_client = Elasticsearch('http://localhost:9200') 

In [24]:
# Index definition: one shard and no replicas; the three free-text fields are
# mapped as "text", while "course" is a "keyword" so it can be matched exactly
# by the term filter used in the search queries.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists raises an error. ignore_unavailable
# makes the delete a no-op when the index is not there yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
# Send every FAQ record to the Elasticsearch index, one request per record;
# tqdm wraps the loop to show progress.
# NOTE(review): no document id is passed, so re-running this cell presumably
# indexes the same records a second time — confirm before re-executing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 506.85it/s]


In [33]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve FAQ records for ``query`` from the Elasticsearch index.

    Args:
        query: Free-text question to match.
        course: Exact value the "course" field must have (term filter).
            Defaults to the course that was previously hard-coded.
        size: Value of the "size" field of the search request, i.e. how many
            hits to ask for.

    Returns:
        A list with the ``_source`` of every returned hit, in the order the
        hits appear in the response.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question" carries a ^4 boost; "section" is not searched.
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; the rest of each hit is dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [37]:
def rag_elastic(query):
    """Answer ``query`` from FAQ entries retrieved through Elasticsearch.

    Same pipeline as rag(), with elastic_search() as the retrieval step:
    retrieve, build the prompt with build_prompt(), then return what llm()
    gives back.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [45]:
# Try the Elasticsearch retrieval step on its own with a new question.
query = 'How do I execute a command in a running docker container?'
search_results = elastic_search(query)

In [82]:
# Ad-hoc Elasticsearch request with the same shape as the one built inside
# elastic_search(), but asking for 3 hits and filtering on the
# machine-learning-zoomcamp course.
query = 'How do I execute a command in a running docker container?'
search_query = {
    "size": 3,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"], # "section" is not searched
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

# Raw search response; the hits are unpacked in the next cell.
response = es_client.search(index=index_name, body=search_query)

In [83]:
# Pull the stored document out of every hit in the raw response.
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [84]:
# Inspect the documents retrieved for the question.
result_docs

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\nHrithik Kumar Advani",
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I copy files from my local machine to docker container?',
 

In [85]:
# Build the prompt from the question and the documents retrieved above.
prompt = build_prompt(query, result_docs)

In [86]:
# Prompt size in characters.
len(prompt)

1555

In [87]:
# Look up the tiktoken encoding associated with the "gpt-4o" model name.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [88]:
# Prompt size in tokens: length of the prompt after encoding it with the
# gpt-4o encoding (compare with the character count above).
len(encoding.encode(prompt))

334