# LLM Zoomcamp 2025_HW1

In [48]:
import json
import requests 

import elasticsearch
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

import tiktoken

## Upload data

In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Q3

In [5]:
es_client = Elasticsearch('http://localhost:9200')

In [6]:
es_client.info()

ObjectApiResponse({'name': 'd99867608567', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'MhU38PtyQRKA0ihe8B_WHg', 'version': {'number': '9.0.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '0a58bc1dc7a4ae5412db66624aab968370bd44ce', 'build_date': '2025-05-28T10:06:37.834829258Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [7]:
index_settings1 = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": { #see types
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name1 = 'specific_questions'

In [8]:
es_client.indices.create(index = index_name1,
                         body=index_settings1)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'specific_questions'})

In [9]:
for doc in tqdm(documents):
    es_client.index(index=index_name1, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 229.18it/s]


In [10]:
query1 = "How do execute a command on a Kubernetes pod?"

In [11]:
search_query1 = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query1,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            }
        }
    }
}

In [12]:
response1 = es_client.search(index=index_name1, body=search_query1)

In [13]:
result_docs1 = []
for hit in response1['hits']['hits']:
    result_docs1.append(hit['_source'])

In [14]:
response1['hits']['hits']

[{'_index': 'specific_questions',
  '_id': 'caH7epcBxdvdwvJ8GplQ',
  '_score': 44.50556,
  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'specific_questions',
  '_id': 'AKH7epcBxdvdwvJ8HJp7',
  '_score': 35.433445,
  '_source': {'text': 'Deploy and Access the Kubernetes Dashboard\nLuke',
   'section': '10. Kubernetes and TensorFlow Serving',
   'question': 'Kubernetes-dashboard',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'specific_questions',
  '_id': 'kaH7epcBxdvdwvJ8GpnP',
  '_score': 33.70974,
  '_source': {'text':

In [15]:
result_docs1

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Deploy and Access the Kubernetes Dashboard\nLuke',
  'section': '10. Kubernetes and TensorFlow Serving',
  'question': 'Kubernetes-dashboard',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t

## Q4

In [16]:
query2 = "How do copy a file to a Docker container?"

In [17]:
search_query2 = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query1,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

In [18]:
response2 = es_client.search(index=index_name1, body=search_query2)

In [19]:
result_docs2 = []
for hit in response2['hits']['hits']:
    result_docs2.append(hit['_source'])

In [23]:
result_docs2[2]['question']

'How do I copy files from a different folder into docker container’s working directory?'

## Q5

In [21]:
context_template = """
Q: {question}
A: {text}
""".strip()

In [37]:
context_entries = []
for res in result_docs2:
    context_entries.append(res["text"])
    
context = "\n\n".join(context_entries)

In [38]:
context

'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)\n\nDeploy and Access the Kubernetes Dashboard\nLuke\n\nYou can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t\t\t\t\t\t\tGopakumar Gopinathan\n\nProblem description:\nI started a web-server in terminal (command window, powershell, etc.). How can I run another python script, which makes a request to this server?\nSolution description:\nJust open another terminal (command window, powershell, etc.) and run a python scri

In [28]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [45]:
prompt = prompt_template.format(
    question='How do copy a file to a Docker container?',
    context=context
)

In [46]:
prompt

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: How do copy a file to a Docker container?\n\nCONTEXT:\nLaunch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)\n\nDeploy and Access the Kubernetes Dashboard\nLuke\n\nYou can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t\t\t\t\t\t\tGopakumar Gopinathan\n\nProblem description:\nI started a web-se

In [47]:
len(prompt)

1665

## Q6

In [49]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [50]:
tokens = encoding.encode(prompt)
num_tokens = len(tokens)
print(num_tokens)

382
