## Testing with OpenAI

In [5]:
import os

from dotenv import load_dotenv

load_dotenv()

True

In [7]:
from openai import OpenAI

# The API key is looked up in the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

### Load documents with IDs

In [13]:
import requests

# Download the FAQ documents (with their ids) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON decode failure on the following line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [14]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

### Load ground truth

In [15]:
import pandas as pd

# Ground-truth (question, course, document id) records from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = base_url + '/' + relative_url + '?raw=1'

# Keep only the machine-learning-zoomcamp rows, then expose them as a list of dicts.
df_ground_truth = pd.read_csv(ground_truth_url).loc[
    lambda frame: frame['course'] == 'machine-learning-zoomcamp'
]
ground_truth = df_ground_truth.to_dict(orient='records')

In [16]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

### Vector Model

In [17]:
from sentence_transformers import SentenceTransformer

# Embedding model; question_text_vector_knn uses it to encode question text
# into the query vector sent to Elasticsearch.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


### Elasticsearch function

In [18]:
# Name of the Elasticsearch index that elastic_search_knn queries.
index_name = "course-questions"

#### Retrieval

In [21]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance at localhost:9200; used by elastic_search_knn.
es_client = Elasticsearch('http://localhost:9200') 

In [22]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search and return the matching documents.

    Args:
        field: Name of the vector field to search against.
        vector: Query embedding sent as the kNN ``query_vector``.
        course: Value the ``course`` field must equal (applied as a term filter).
        k: Number of nearest neighbours to request. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Value sent as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        A list holding the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
            "filter": {"term": {"course": course}},
        },
        # Only fetch the fields the prompt builder and evaluation need.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

def question_text_vector_knn(q):
    """Embed the question in ``q`` and search the ``question_text_vector`` field.

    Args:
        q: Mapping with a 'question' key (text to embed) and a 'course' key
            (passed on as the course filter).

    Returns:
        Whatever ``elastic_search_knn`` returns for that embedding and course.
    """
    text, course_name = q['question'], q['course']
    return elastic_search_knn('question_text_vector', model.encode(text), course_name)

In [23]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

### The RAG Flow

In [25]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for ``query`` from retrieved FAQ entries.

    Args:
        query: Question text placed in the QUESTION slot.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each becomes one entry of the CONTEXT section.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" entry per document, each followed by a blank line.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [26]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text used as the content of the one user message.
        model: Model name passed to the chat completions call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [27]:
# previously: rag(query: str) -> str
def rag(query: dict, model='gpt-4o') -> str:
    """Answer a question record using retrieval followed by generation.

    Args:
        query: Record passed to ``question_text_vector_knn`` for retrieval;
            its 'question' entry is also used as the prompt's question.
        model: Model name forwarded to ``llm``.

    Returns:
        The answer returned by ``llm`` for the built prompt.
    """
    retrieved_docs = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved_docs), model=model)

In [28]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [29]:
rag(ground_truth[10])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'Yes, if you miss a session, it will be recorded. All sessions, including office hours, are recorded so you won’t miss any information. You can view these recordings later and also ask questions in advance for office hours, which will be addressed during the live stream. Additionally, you can ask questions anytime in Slack.'

In [31]:
rag(ground_truth[10], model='gpt-3.5-turbo')

'No, sessions are recorded so you can watch them later if you miss one.'

In [32]:
rag(ground_truth[100], model='gpt-3.5-turbo')

'Based on the information provided, the course videos will not be re-recorded, so the ones from 2021 will be used. The focus of the course and the skills being taught remained the same, and the videos are still up-to-date.'

In [33]:
# Spot-check ground-truth records 101-105 with gpt-3.5-turbo and print each answer.
for i in range(101, 106):
    answer_llm = rag(ground_truth[i], model='gpt-3.5-turbo')
    print(f"Answer question {i}: {answer_llm}")

Answer question 101: No, the course videos are not being re-recorded for this iteration. The focus of the course and the skills being taught remain the same, and the videos are up-to-date.
Answer question 102: No, there isn't much difference in the skills taught in this iteration compared to 2021. The main change is that there was one special module (BentoML) in the previous iteration of the course, but the rest of the modules are the same as in 2022. The homework for this year is different.
Answer question 103: If you didn't take the course in 2021, you can start watching the videos as the course content and skills being taught remained the same and are still up-to-date. It is recommended to use Python 3.10 instead of Python 3.8.
Answer question 104: Based on the context provided, Python 3.10 is the recommended version for the 2023 iteration of the course.
Answer question 105: Based on the context provided, when posting about what you learned from the course on social media, you shoul

In [34]:

# Spot-check ground-truth records 101-105 with gpt-4o-mini and print each answer.
for i in range(101, 106):
    answer_llm = rag(ground_truth[i], model='gpt-4o-mini')
    print(f"Answer question {i}: {answer_llm}")

Answer question 101: No, the course videos are not being re-recorded for this iteration. The videos from the previous iteration are still up-to-date, and the focus of the course and the skills being taught remain the same.
Answer question 102: Yes, there is a difference in the context of the skills taught in this iteration compared to 2021. While the focus of the course and the skills we want to teach remained the same, the course videos from the previous iterations (including 2021) will not be re-recorded. The skills taught have been consistent, but improvements in recommended tools, like using Python 3.10 instead of Python 3.8, have been suggested for the current iteration.
Answer question 103: Yes, you should watch the videos even if you didn't take the course in 2021. The videos are from the previous iteration, but they are still up-to-date and useful for learning new things.
Answer question 104: The recommended Python version for this course iteration is Python 3.10.
Answer questi