In [1]:
from dotenv import load_dotenv
import openai
import requests
from openai import OpenAI
import json
from pprint import pprint
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
load_dotenv() 

True

## Data Loading Part

In [2]:
# Location of the FAQ corpus: a JSON list of courses, each carrying its own 'documents' list.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the wait and fail loudly on an HTTP error instead of hanging forever
# or trying to JSON-decode an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with the course it belongs to.
# Each record is a copy, so the raw payload is left unmodified and the cell can be
# re-run safely.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
# Inspect one flattened record to confirm the 'course' field was attached.
pprint(documents[1])

{'course': 'data-engineering-zoomcamp',
 'question': 'Course - What are the prerequisites for this course?',
 'section': 'General course-related questions',
 'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'}


In [4]:

# Client for the Elasticsearch instance listening on localhost:9200.
es_client = Elasticsearch(
    "http://localhost:9200",
    # NOTE(review): the previous comment here mentioned forcing 8-compatible headers,
    # but no such argument is passed — confirm whether one was dropped by accident.
)

In [5]:
# Index layout: the free-text fields are analysed for full-text search, while
# 'course' is a keyword so it can be matched exactly in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}
index_name = "course-questions"

# Drop any index left over from a previous run so this cell is idempotent:
# without this, re-running fails because the index already exists.
# ignore_unavailable=True makes the delete a no-op on a fresh cluster.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
# Index every flattened FAQ record, one request per document; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [7]:
# Sample user question passed to rag() further down.
query = 'How do copy a file to a Docker container?'



In [12]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Return the best-matching FAQ documents for `query` from Elasticsearch.

    The query text is matched against the 'question' and 'text' fields, with
    'question' boosted 4x, and results are restricted to a single course.

    Args:
        query: Free-text question to search for.
        course: Exact value of the 'course' field to filter on.
        size: Maximum number of hits to return.

    Returns:
        A list with the `_source` dict of each hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course,
                    }
                },
            }
        },
    }
    search_response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in search_response['hits']['hits']]

### Retrieving search results (brute-forced to only the top-3 results by score here). Next, add the results as context, together with the question, to the prompt for the LLM

In [None]:
def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Each entry in `search_results` must provide 'section', 'question' and
    'text' keys. Entries are rendered as three-line blocks separated by a
    blank line and placed under the CONTEXT heading of the prompt.
    """
    rendered_entries = []
    for entry in search_results:
        rendered_entries.append(
            f"section: {entry['section']}\n"
            f"question: {entry['question']}\n"
            f"answer: {entry['text']}"
        )
    context_text = "\n\n".join(rendered_entries)

    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT "
        "from the FAQ database. Use only the facts from the CONTEXT when answering.\n\n"
        "QUESTION: {question}\n\nCONTEXT:\n{context}"
    )
    return prompt_template.format(question=query, context=context_text)

In [21]:
# Shared OpenAI client; presumably picks up its API key from the environment
# populated by load_dotenv() — confirm against your .env.
client = OpenAI()


def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [22]:
def rag(query):
    """Answer `query` by retrieving FAQ entries, building a prompt, and asking the LLM.

    The assembled prompt and its character count are printed before the LLM
    call. Returns whatever llm() returns for that prompt.
    """
    retrieved = elastic_search(query)
    llm_prompt = build_prompt(query, retrieved)
    pprint(llm_prompt)
    print(len(llm_prompt))
    return llm(llm_prompt)


# NOTE(review): despite its name, `prompt` receives the value returned by rag()
# (the LLM's answer), not the prompt text; the name is kept because later cells read it.
prompt = rag(query)
prompt

("You're a course teaching assistant. Answer the QUESTION based on the CONTEXT "
 'from the FAQ database. Use only the facts from the CONTEXT when answering.\n'
 '\n'
 'QUESTION: How do copy a file to a Docker container?\n'
 '\n'
 'CONTEXT:\n'
 'section: 5. Deploying Machine Learning Models\n'
 'question: How do I debug a docker container?\n'
 'answer: Launch the container image in interactive mode and overriding the '
 'entrypoint, so that it starts a bash command.\n'
 'docker run -it --entrypoint bash <image>\n'
 'If the container is already running, execute a command in the specific '
 'container:\n'
 'docker ps (find the container-id)\n'
 'docker exec -it <container-id> bash\n'
 '(Marcos MJD)\n'
 '\n'
 'section: 5. Deploying Machine Learning Models\n'
 'question: How do I copy files from my local machine to docker container?\n'
 'answer: You can copy files from your local machine into a Docker container '
 "using the docker cp command. Here's how to do it:\n"
 'To copy a file or di

'To copy a file from your local machine to a Docker container, you can use the `docker cp` command. The basic syntax for this operation is as follows:\n\n```\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\n```'

In [23]:
prompt

'To copy a file from your local machine to a Docker container, you can use the `docker cp` command. The basic syntax for this operation is as follows:\n\n```\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\n```'

In [18]:
# Count how many tokens `prompt` occupies under the gpt-4o tokenizer.
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")
len(encoding.encode(prompt))  # NOTE(review): was annotated "2048 tokens", but the recorded output is 351 — confirm which value of `prompt` this ran against

351

In [19]:
# Map a single token id back to its raw bytes (the recorded output for 63842 is b"You're").
encoding.decode_single_token_bytes(63842)

b"You're"