# Introduction to LLM and RAG

In [25]:
import json

import minsearch

In [26]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 so the file is decoded
# the same way on every platform instead of with the locale's default codec
# (the FAQ answers contain non-ASCII characters such as typographic quotes).
with open('data/documents.json', 'r', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [27]:
# Flatten the per-course structure into a single list of FAQ entries and
# collect the distinct course names along the way.
documents = []
course_names = set()

for course_entry in docs_raw:
    name = course_entry['course']
    course_names.add(name)
    for faq in course_entry['documents']:
        # Tag the entry in place with the course it belongs to.
        faq['course'] = name
    documents.extend(course_entry['documents'])

print('Total documents:', len(documents))
print('Courses:', course_names)

Total documents: 948
Courses: {'data-engineering-zoomcamp', 'machine-learning-zoomcamp', 'mlops-zoomcamp'}


## Working with minsearch library

In [28]:
# Build the minsearch index over the flattened FAQ entries.
index = minsearch.Index(
    # Fields registered as text fields.
    text_fields=['question', 'text', 'section'],
    # Field registered as a keyword field; `search` filters on it via `filter_dict`.
    keyword_fields=['course']
)
# Last expression of the cell, so the notebook displays the object `fit` returns.
index.fit(documents)

<minsearch.minsearch.Index at 0x131e61690>

In [29]:
def search(query, course='machine-learning-zoomcamp', num_results=5):
    """Search the minsearch index for FAQ entries matching ``query``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field is filtered on. Defaults to
            ``'machine-learning-zoomcamp'``, the value that used to be
            hard-coded, so existing one-argument calls behave as before.
        num_results: Maximum number of entries to request. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns for the request; the notebook
        indexes into it as a sequence of document dicts.
    """
    # Boost factors handed to the index: 3.0 for `question`, 0.5 for `section`.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [30]:
# Sample question passed to `search` in the next cell.
question_data = 'the course has already started, can I still enroll?'

In [31]:
# Run the sample question through `search` and display the `text` field of
# the first returned entry.
results = search(question_data)
results[0]['text']

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

## Working with OpenAI API

In [32]:
import os

from dotenv import load_dotenv
from openai import OpenAI

# Load variables through python-dotenv before reading them from the environment.
load_dotenv()

# Connection settings for the OpenAI-compatible endpoint. Each value is None
# when the corresponding environment variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENROUTER_ENDPOINT = os.environ.get("OPENROUTER_ENDPOINT")
OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL")

client = OpenAI(api_key=OPENROUTER_API_KEY, base_url=OPENROUTER_ENDPOINT)

In [33]:
def replace_placeholders(template_path, context_content, question_content):
    """Fill the ``{{CONTEXT}}`` and ``{{QUESTION}}`` placeholders of a template file.

    The template is read as UTF-8. Both placeholders are substituted in a
    single pass over the template text, so placeholder-looking text inside
    ``context_content`` or ``question_content`` is inserted literally instead
    of being substituted a second time.

    Args:
        template_path: Path of the template file.
        context_content: Text that replaces every ``{{CONTEXT}}`` occurrence.
        question_content: Text that replaces every ``{{QUESTION}}`` occurrence.

    Returns:
        The filled-in prompt, or ``None`` (after printing a message) when the
        template file is missing or any other error occurs.
    """
    try:
        with open(template_path, "r", encoding="utf-8") as f:
            template = f.read()

        # Split on one placeholder, substitute the other inside the template
        # pieces only, then join the pieces with the context. The inserted
        # context is therefore never scanned for `{{QUESTION}}`.
        pieces = template.split("{{CONTEXT}}")
        return context_content.join(
            piece.replace("{{QUESTION}}", question_content) for piece in pieces
        )
    except FileNotFoundError:
        print(f"Error: Template file not found at {template_path}")
        return None
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"An error occurred: {e}")
        return None


def prepare_prompt(question):
    """Build the LLM prompt for ``question`` from the ``search`` results.

    Returns ``None`` when the search yields nothing; otherwise returns what
    ``replace_placeholders`` produces for the ``prompt_assistant.txt`` template.
    """
    hits = search(question)
    if not hits:
        return None

    # One "section / question / answer" entry per hit, each followed by a blank line.
    context_data = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in hits
    )

    return replace_placeholders("prompt_assistant.txt", context_data, question)

def rag(question):
    """Answer ``question`` by sending a retrieval-augmented prompt to the chat model.

    Args:
        question: The user's question.

    Returns:
        The message content of the first choice in the chat completion, or
        ``None`` when ``prepare_prompt`` could not produce a prompt.
    """
    prompt = prepare_prompt(question)
    if not prompt:
        # Nothing to send: stop here instead of issuing a request whose
        # message content would be None.
        print("Unable to find information about the question.")
        return None

    response = client.chat.completions.create(
        model=OPENROUTER_MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [34]:
# Ask a question through the `rag` helper and print the model's reply.
question_data = 'How do I get started with Week 2'
answer = rag(question_data)
print(answer)

<answer>
Hello there!

To get started with Week 2, which began on September 18, 2023, here are some crucial links:

*   **Ask questions for Live Sessions:** https://app.sli.do/event/vsUpjYsayZ8A875Hq8dpUa/live/questions
*   **Calendar for weekly meetings:** https://calendar.google.com/calendar/u/0/r?cid=cGtjZ2tkbGc1OG9yb2lxa2Vwc2g4YXMzMmNAZ3JvdXAuY2FsZW5kYXIuZm9vZ2xlLmNvbQ&pli=1
*   **Week 2 Homework:** https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/02-regression/homework.md
*   **Submit HW Week 2:** https://docs.google.com/forms/d/e/1FAIpQLSf8eMtnErPFqzzFsEdLap_GZ2sMih-H-Y7F_IuPGqt4fOmOJw/viewform (This link is also available at the bottom of the Week 2 HW link)
*   **All Homeworks:** https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/
*   **GitHub for theory:** https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp
*   **YouTube Link (2.X):** https://www.youtube.com/watch?v=vM3SqPNlStE&list=P

## Q1. Running Elastic

In [43]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

# Elasticsearch client addressed at http://localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

In [44]:
# Settings and field mappings intended for the Elasticsearch index.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Text-typed fields; `elastic_search` runs its multi_match over these.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Keyword-typed field; `elastic_search` applies a term filter to it.
            "course": {"type": "keyword"}
        }
    }
}

# Name of the index that the following cells write to and search.
index_name = "course-questions"

In [45]:
# Create the index with the explicit settings/mappings on first use. Without
# this step the first `index` call creates the index implicitly with
# dynamically inferred mappings, and `course` is then not the plain keyword
# field that the term filters in the search cells rely on.
# Documents are loaded only into a freshly created index, so re-running this
# cell does not store every FAQ entry a second time.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(
        index=index_name,
        settings=index_settings["settings"],
        mappings=index_settings["mappings"],
    )

    for doc in tqdm(documents):
        # Q2 Which function do you use for adding your data to elastic?
        es_client.index(index=index_name, document=doc)

    # Make the new documents visible to the searches in the following cells.
    es_client.indices.refresh(index=index_name)

100%|██████████| 948/948 [00:01<00:00, 556.04it/s]


In [46]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the Elasticsearch index for FAQ entries matching ``query``.

    Args:
        query: Free-text question to look up.
        course: Value used in the ``term`` filter on the ``course`` field.
            Defaults to ``"data-engineering-zoomcamp"``, the value that used
            to be hard-coded, so existing one-argument calls behave as before.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list with the ``_source`` document of every returned hit, in the
        order Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `question^3` boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]


# Example query string; it is only assigned here, not passed to any function in this cell.
query = 'I just disovered the course. Can I still join it?'

In [47]:
def prepare_prompt(question):
    """Build the LLM prompt for ``question`` from the ``elastic_search`` results.

    Returns ``None`` when nothing is found; otherwise returns what
    ``replace_placeholders`` produces for the ``prompt_assistant.txt`` template.
    """
    found = elastic_search(question)
    if not found:
        return None

    # Render every retrieved entry as a "section / question / answer" paragraph.
    entries = []
    for item in found:
        entries.append(
            f"section: {item['section']}\n"
            f"question: {item['question']}\n"
            f"answer: {item['text']}\n\n"
        )

    return replace_placeholders("prompt_assistant.txt", "".join(entries), question)


def rag(question):
    """Answer ``question`` by sending a retrieval-augmented prompt to the chat model.

    Args:
        question: The user's question.

    Returns:
        The message content of the first choice in the chat completion, or
        ``None`` when ``prepare_prompt`` could not produce a prompt.
    """
    prompt = prepare_prompt(question)
    if not prompt:
        # Nothing to send: stop here instead of issuing a request whose
        # message content would be None.
        print("Unable to find information about the question.")
        return None

    response = client.chat.completions.create(
        model=OPENROUTER_MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [48]:
# Ask a question through the `rag` helper and print the model's reply.
question_data = 'The course has already started, can I still enroll?'
answer = rag(question_data)
print(answer)

<answer>
Hi there!

Yes, you can still join the course even after it has started. You are eligible to submit homeworks even if you don't officially register. However, please be aware that there will be deadlines for turning in the final projects, so it's a good idea not to leave everything until the last minute.

Hope this helps! Let us know if you have any other questions.
</answer>


## Q3. Searching

In [49]:
question_data = 'How do execute a command on a Kubernetes pod?'

# Q3: multi_match over `question` (boosted with ^4) and `text`, with no course
# filter; the score of the first hit is printed.
multi_match = {
    "query": question_data,
    "fields": ["question^4", "text"],
    "type": "best_fields",
}
search_query = {
    "size": 5,
    "query": {"bool": {"must": {"multi_match": multi_match}}},
}

response = es_client.search(index=index_name, body=search_query)

top_hit = response['hits']['hits'][0]
print(top_hit['_score'])

44.50556


In [50]:
question_data = 'How do copy a file to a Docker container?'

# Q4: multi_match over `question` (boosted with ^4) and `text`, limited to
# three hits from one course; the question of the last hit is printed.
must_clause = {
    "multi_match": {
        "query": question_data,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}
filter_clause = {"term": {"course": "machine-learning-zoomcamp"}}
search_query = {
    "size": 3,
    "query": {"bool": {"must": must_clause, "filter": filter_clause}},
}

response = es_client.search(index=index_name, body=search_query)

hits = response['hits']['hits']
print(hits[-1]['_source']['question'])

How do I copy files from a different folder into docker container’s working directory?


## Q5. Building a prompt

In [51]:
query = 'How do copy a file to a Docker container?'

# Layout of one FAQ entry inside the context section of the prompt.
context_template = """
Q: {question}
A: {text}
""".strip()


# Layout of the full prompt; `{context}` receives the concatenated entries.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Render every hit of the last Elasticsearch response, each entry followed by
# a blank line, and concatenate them in hit order.
sources = (hit["_source"] for hit in response["hits"]["hits"])
context = "".join(
    context_template.format(question=source['question'], text=source['text']) + "\n\n"
    for source in sources
)
prompt = prompt_template.format(question=query, context=context)
len(prompt.strip())

1446

## Q6. Tokens

In [57]:
import tiktoken

def count_tokens(text, model="gpt-4o"):
    """Return the number of tokens ``text`` encodes to.

    The encoding is the one ``tiktoken.encoding_for_model`` returns for
    ``model`` (``"gpt-4o"`` unless overridden).
    """
    return len(tiktoken.encoding_for_model(model).encode(text))

# Q6: token count of `prompt` with the default model; shown as the cell output.
count_tokens(prompt)

321

## Bonus: generating the answer (ungraded)

In [58]:
prompt_reuse = prompt

# Send the prompt as a single user message whose content is one text part.
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": prompt_reuse}],
}
completion = client.chat.completions.create(
    model=OPENROUTER_MODEL,
    messages=[user_message],
)

# Keep the reply text around; the cost estimate below counts its tokens.
content_reuse = completion.choices[0].message.content
print(content_reuse)

You can copy a file to a Docker container using the `docker cp` command. The basic syntax is:

`docker cp /path/to/local/file_or_directory container_id:/path/in/container`


## Bonus: calculating the costs (ungraded)

In [59]:
# Prices per 1,000 tokens used for the cost estimate below.
# NOTE(review): hard-coded values; confirm they match the provider's current pricing.
GPT4O_INPUT_PRICE_PER_K_TOKENS = 0.005
GPT4O_OUTPUT_PRICE_PER_K_TOKENS = 0.015
# Model name passed to `count_tokens` when counting tokens for the estimate.
MODEL_NAME = "gpt-4o"


def calculate_cost(input_tokens, output_tokens, input_price_per_k_tokens, output_price_per_k_tokens):
    """Compute the cost of one request from token counts and per-1,000-token prices.

    Args:
        input_tokens: Number of input tokens.
        output_tokens: Number of output tokens.
        input_price_per_k_tokens: Price charged per 1,000 input tokens.
        output_price_per_k_tokens: Price charged per 1,000 output tokens.

    Returns:
        A dict with the keys ``input_cost``, ``output_cost`` and ``total_cost``
        (the sum of the other two).
    """
    costs = {
        "input_cost": (input_tokens / 1000) * input_price_per_k_tokens,
        "output_cost": (output_tokens / 1000) * output_price_per_k_tokens,
    }
    costs["total_cost"] = costs["input_cost"] + costs["output_cost"]
    return costs

# Token counts of the prompt that was sent and of the reply that came back.
input_tokens = count_tokens(prompt_reuse, MODEL_NAME)
output_tokens = count_tokens(content_reuse, MODEL_NAME)

single_request_cost = calculate_cost(
    input_tokens=input_tokens,
    output_tokens=output_tokens,
    input_price_per_k_tokens=GPT4O_INPUT_PRICE_PER_K_TOKENS,
    output_price_per_k_tokens=GPT4O_OUTPUT_PRICE_PER_K_TOKENS,
)

print("Cost of a single request:")
print(f"  Input tokens: {input_tokens}")
print(f"  Output tokens: {output_tokens}")
print(f"  Input cost: ${single_request_cost['input_cost']:.6f}")
print(f"  Output cost: ${single_request_cost['output_cost']:.6f}")
print(f"  Total cost: ${single_request_cost['total_cost']:.6f}")

# Calculate the cost for 1000 requests by scaling every entry of the
# single-request breakdown.
num_requests = 1000
cost_for_1000_requests = {
    cost_key: cost_value * num_requests
    for cost_key, cost_value in single_request_cost.items()
}

print(f"\nCost for {num_requests} requests:")
print(f"  Input cost: ${cost_for_1000_requests['input_cost']:.6f}")
print(f"  Output cost: ${cost_for_1000_requests['output_cost']:.6f}")
print(f"  Total cost: ${cost_for_1000_requests['total_cost']:.6f}")


Cost of a single request:
  Input tokens: 321
  Output tokens: 39
  Input cost: $0.001605
  Output cost: $0.000585
  Total cost: $0.002190

Cost for 1000 requests:
  Input cost: $1.605000
  Output cost: $0.585000
  Total cost: $2.190000
