In [35]:
import requests 
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import json
from groq import Groq
import os

In [36]:
client = Groq(api_key=os.getenv("GROQ_API_KEY"))


In [37]:
es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '33794009578d', 'cluster_name': 'docker-cluster', 'cluster_uuid': '-KNU-I60Qn-diTdA9ThUJw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [38]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [58]:
docs_raw

[{'course': 'llm-zoomcamp',
  'documents': [{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
    'section': 'General course-related questions',
    'question': 'I just discovered the course. Can I still join?',
    'course': 'llm-zoomcamp'},
   {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework (while the form is Open) without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
    'section': 'General course-related questions',
    'question': 'Course - I have registered for the [insert-zoomcamp-name]. When can I expect to receive the confirmation email?',
    'course': 'llm-zoomcamp'},
   {'text': 'The zoom link is only published to instructors/presenters/TAs.\nStudents participate via Youtube Live and submit questions to Slido (link would be pinned in the chat when Alexey goes Liv

In [68]:
with open('documents.json', 'r', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

documents = []

# Same logic as the requests-based version
for course in documents_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Optional: Print first 2 entries to verify
print(documents[:2])

[{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.', 'section': 'General course-related questions', 'question': 'I just discovered the course. Can I still join?', 'course': 'llm-zoomcamp'}, {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework (while the form is Open) without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.", 'section': 'General course-related questions', 'question': 'Course - I have registered for the [insert-zoomcamp-name]. When can I expect to receive the confirmation email?', 'course': 'llm-zoomcamp'}]


In [70]:
import json
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client
es = Elasticsearch("http://localhost:9200")  # adjust if using cloud or different port

# Load documents from local JSON file
with open('documents.json', 'r', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

# Flatten the documents
documents = []
for course in documents_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Elasticsearch index settings
index_name = 'assignments_questions_n'
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

# Create the index only if it doesn't exist
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

# Index each document
for i, doc in enumerate(documents):
    es.index(index=index_name, document=doc, id=i)

print(f"Indexed {len(documents)} documents into '{index_name}'")


  es.indices.create(index=index_name, body=index_settings)


Index 'assignments_questions_n' created.
Indexed 86 documents into 'assignments_questions_n'


In [63]:
documents[0]

{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
 'section': 'General course-related questions',
 'question': 'I just discovered the course. Can I still join?',
 'course': 'llm-zoomcamp'}

In [71]:
es.info()

ObjectApiResponse({'name': '33794009578d', 'cluster_name': 'docker-cluster', 'cluster_uuid': '-KNU-I60Qn-diTdA9ThUJw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

Question 2  = index

In [72]:
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 86/86 [00:26<00:00,  3.24it/s]


Question 3

In [73]:
query = "How do execute a command on a Kubernetes pod?"


In [74]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es.search(index=index_name, body=search_query)

    result_docs = [hit['_source'] for hit in response['hits']['hits']]

    return {
        "result": result_docs,
        "raw_response": response
    }


In [96]:
res = elastic_search(query="How do execute a command on a Kubernetes pod?", course="llm-zoomcamp")


  response = es.search(index=index_name, body=search_query)


In [99]:
res['result']


[{'text': '1. search with the model name on hugging face.\n2. get the transformer used on the model.\n3. using the transformer, encode the string you want.\n4. calculate the length of the outputted tensor.\nThe previous code snippet uses the tokenizer of google/gemma-2b LLM. \nDon’t forget to make your token secret.\nAdded by kamal',
  'section': 'Module 2: Open-Source LLMs',
  'question': 'HuggingFace: How to get the number of tokens in a certain string related to a certain model on hugging face?',
  'course': 'llm-zoomcamp'},
 {'text': '1. search with the model name on hugging face.\n2. get the transformer used on the model.\n3. using the transformer, encode the string you want.\n4. calculate the length of the outputted tensor.\nThe previous code snippet uses the tokenizer of google/gemma-2b LLM. \nDon’t forget to make your token secret.\nAdded by kamal',
  'section': 'Module 2: Open-Source LLMs',
  'question': 'HuggingFace: How to get the number of tokens in a certain string related

QUESTION 5


In [89]:
context_template = """
Q: {question}
A: {text}
""".strip()

context_template

'Q: {question}\nA: {text}'

In [90]:
context_template = """
Q: {question}
A: {text}
""".strip()

context_entries = []
for record in res['result']:
    context_entries.append(context_template.format(question=record["question"], text=record["text"]))

# Join all context entries with two line breaks
context = "\n\n".join(context_entries)

# Print the final context
print(context)

Q: Docker: How to inspect the content of a file inside a Docker container ?
A: You have several ways to inspect the content of a file when you are inside a Docker container.
First, make sure you ran the docker container interactively using bash:
docker exec -it <container> bash
Then, you are able to use bash commands. For this case, I propose two solutions:
Use “cat” and the file you want to see the content: cat your_file . This will directly print the content in your terminal.
Install vim or nano using apt get and open the file using vim or nano (this can be more suitable for larger files):
apt-get install vim
vim your_file
Then, you can exit your file in vim by pressing ESC then typing “:q” and finally press ENTER
Added by Mélanie Fouesnard

Q: Docker: How to inspect the content of a file inside a Docker container ?
A: You have several ways to inspect the content of a file when you are inside a Docker container.
First, make sure you ran the docker container interactively using bash:


In [91]:
""""
The question asks us to use this query: {"How do I execute a command in a running docker container?"}, 
but the exercise does not give the correct answer except we use the previous query: {"How do copy a file to a Docker container?"}.
This should be corrected in the document.
"""
q = "How to install docker container?"

In [92]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [93]:
prompt = prompt_template.format(question=query, context=context)

print(f"prompt length is: {len(prompt)}")

prompt length is: 2262
