# Ground Truth Dataset Generation For Retrieval Evaluation

In [1]:
import requests 

# Download the course FAQ documents (a JSON list with one entry per course).
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into a single list of records, tagging each record with its course name.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

## Assign a unique id/number to each document record

We need to make sure that each record has a unique id, so we derive one by hashing a few of its fields.

In [3]:
import hashlib

def generate_document_id(doc):
    """Return a short hexadecimal id derived from a document record.

    The id is the first 8 hex characters of the MD5 digest of the string
    "<course>-<question>-<first 10 characters of text>".

    Args:
        doc: Mapping with 'course', 'question' and 'text' entries.

    Returns:
        An 8-character hexadecimal string.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [4]:
# Attach the hash-derived id to every record, modifying the records in place.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [5]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

In [6]:
from collections import defaultdict

In [7]:
# Bucket the records by id so that ids shared by several records can be spotted.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [8]:
# If these two counts differ, some documents share an id; we can neglect such duplicates for now.
len(hashes), len(documents)

(947, 948)

In [9]:
# Print every id that is shared by more than one record, with its record count.
for k, values in hashes.items():
    if len(values) <= 1:
        continue
    print(k, len(values))

593f7569 2


In [10]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

## Saving documents with their generated hashed ids

In [11]:
import json

In [12]:
# Write the records (now carrying their ids) to disk as indented JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [13]:
!head -n 20 documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",
    "section": "General course-related questions",
    "question": "Course - What are the prerequisites for this course?",
    "course": "data-engineering-zoomcamp",
    "id": "1f6520ca"
  },
  {
    "text": "Yes, even if you don't register, you'

## Generating users' questions for each document record using the LLM

In [21]:
# Prompt sent to the LLM for each FAQ record. The `{section}`, `{question}` and
# `{text}` placeholders are filled in with `str.format` in `generate_questions`.
# NOTE(review): "as fewer words as possible" is ungrammatical ("as few words as
# possible" was presumably intended); left untouched because editing the prompt
# may change the model's output.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [14]:
import os
from groq import Groq
from dotenv import load_dotenv

load_dotenv()

True

In [15]:
# Groq API client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [19]:
def generate_questions(doc):
    """Ask the LLM to write student-style questions for one FAQ record.

    Args:
        doc: Mapping used to fill the placeholders of `prompt_template`.

    Returns:
        The text content of the first choice in the chat completion, unparsed.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='llama3-8b-8192',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [40]:
doc = documents[0]
doc

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [41]:
json_response = generate_questions(doc)
json_response

'Here are the 5 questions the student might ask based on the FAQ record:\n\n[\n"Can you please tell me the exact start date and time of the course?",\n"How will I receive updates about the course",\n"What is the format of the first live session of the course?",\n"Can I register for the course from any device?",\n"Are there any additional steps I need to take to fully participate in the course?"'

In [42]:
print(json_response)

Here are the 5 questions the student might ask based on the FAQ record:

[
"Can you please tell me the exact start date and time of the course?",
"How will I receive updates about the course",
"What is the format of the first live session of the course?",
"Can I register for the course from any device?",
"Are there any additional steps I need to take to fully participate in the course?"


#### Generate questions for each doc

In [43]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
results = {}

In [45]:
# Query the LLM once per distinct document id. Ids already present in `results`
# are skipped, so re-running the cell only fills in what is missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

100%|██████████| 948/948 [49:04<00:00,  3.11s/it] 


In [46]:
results["593f7569"]

'Here are 5 questions the student might ask based on the FAQ record:\n\n[\n"How do decision trees and ensemble learning relate to each other?",\n"What is a server in the context of Python and gunicorn?",\n"Won\'t running gunicorn directly cause any issues",\n"How can I avoid overfitting in my decision tree",\n"Can I use gunicorn to run multiple Python files simultaneously"\n]'

In [47]:
print(results["593f7569"])

Here are 5 questions the student might ask based on the FAQ record:

[
"How do decision trees and ensemble learning relate to each other?",
"What is a server in the context of Python and gunicorn?",
"Won't running gunicorn directly cause any issues",
"How can I avoid overfitting in my decision tree",
"Can I use gunicorn to run multiple Python files simultaneously"
]


In [106]:
def extract_questions(input_string):
    """Extract the JSON-array text holding the questions from a raw LLM response.

    The response may carry prose before the array, wrap the items in braces
    (for example `[{"q1", "q2"}]` or `[{"question": "q1"}, ...]`), spread the
    array over several lines, or omit the closing bracket. This function cuts
    out the text from the first `[` up to the first `]` that is not inside a
    double-quoted string, removes newlines, removes `{`/`}` that are outside
    quoted strings, drops `"question":` keys, and appends a `]` when none was
    found.

    Brackets and braces inside a quoted question (e.g. "What does arr[0] do?")
    are left alone, so such questions neither end the array early nor lose
    characters.

    Args:
        input_string: Raw text returned by the LLM.

    Returns:
        The cleaned array text as a `str`, or `None` when `input_string` is not
        a string or contains no `[`. The result is not guaranteed to be valid
        JSON; callers should still guard the `json.loads` call.
    """
    if not isinstance(input_string, str):
        return None

    start_index = input_string.find('[')
    if start_index == -1:
        return None

    kept = []
    in_string = False   # currently inside a double-quoted string
    escaped = False     # previous character was a backslash inside a string
    closed = False      # a closing bracket outside any string was seen

    for char in input_string[start_index:]:
        if char == '\n':
            # Raw newlines are never wanted: they are layout outside strings
            # and invalid JSON inside them.
            continue
        if in_string:
            kept.append(char)
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char in '{}':
            # Object braces around the items are noise for a flat list.
            continue
        kept.append(char)
        if char == '"':
            in_string = True
        elif char == ']':
            closed = True
            break

    if not closed:
        # Incomplete response: close the array ourselves.
        kept.append(']')

    return ''.join(kept).replace('"question":', '')


In [86]:
# Example usage
input_string = results["593f7569"]

print(extract_questions(input_string))

["How do decision trees and ensemble learning relate to each other?","What is a server in the context of Python and gunicorn?","Won't running gunicorn directly cause any issues","How can I avoid overfitting in my decision tree","Can I use gunicorn to run multiple Python files simultaneously"]


In [88]:
json.loads(extract_questions(input_string))

['How do decision trees and ensemble learning relate to each other?',
 'What is a server in the context of Python and gunicorn?',
 "Won't running gunicorn directly cause any issues",
 'How can I avoid overfitting in my decision tree',
 'Can I use gunicorn to run multiple Python files simultaneously']

In [101]:
input_string = results["1f6520ca"]
input_string


'Here are 5 questions the student might ask based on the FAQ record:\n\n[{"What are the prerequisites for the course?", "What do I need to know before starting the course?", "How can I prepare for the course?", "Do I need any prior knowledge to take this course?", "Are there any specific skills or qualifications required for the course?"}'

In [102]:
print(extract_questions(input_string))

["What are the prerequisites for the course?", "What do I need to know before starting the course?", "How can I prepare for the course?", "Do I need any prior knowledge to take this course?", "Are there any specific skills or qualifications required for the course?"]


In [103]:
json.loads(extract_questions(input_string))

['What are the prerequisites for the course?',
 'What do I need to know before starting the course?',
 'How can I prepare for the course?',
 'Do I need any prior knowledge to take this course?',
 'Are there any specific skills or qualifications required for the course?']

### Saving a results dictionary to a pickle .bin file

In [65]:
import pickle

def write_dict_to_pickle(file_path, data):
    """Pickle `data` and write it to `file_path`, overwriting any existing file.

    Args:
        file_path: Destination path, opened in binary write mode.
        data: Object to serialize with `pickle`.
    """
    with open(file_path, 'wb') as sink:
        sink.write(pickle.dumps(data))


In [66]:
# Persist the raw LLM responses so the long generation loop need not be repeated.
file_path = 'results.bin'
write_dict_to_pickle(file_path=file_path, data=results)

print(f"Results dictionary written to {file_path}")

Results dictionary written to results.bin


### Loading results dictionary from pickle file

In [125]:
# Reload the raw responses from 'results.bin'. Only unpickle files you trust:
# pickle can execute arbitrary code while loading.
with open('results.bin', 'rb') as f_in:
    results = pickle.loads(f_in.read())

In [126]:
results['1f6520ca']

'Here are 5 questions the student might ask based on the FAQ record:\n\n[{"What are the prerequisites for the course?", "What do I need to know before starting the course?", "How can I prepare for the course?", "Do I need any prior knowledge to take this course?", "Are there any specific skills or qualifications required for the course?"}'

In [127]:
len(results)

947

### Parsing the results dictionary

In [128]:
# Turn each raw LLM response into a list of questions. Responses that cannot be
# cleaned up or parsed are reported and left out of `parsed_results`.
parsed_results = {}

for doc_id, raw_response in results.items():
    cleaned = None
    try:
        cleaned = extract_questions(raw_response)
        parsed_results[doc_id] = json.loads(cleaned)
    except Exception as e:
        # Best-effort parsing: report the failure and move on to the next record.
        print(f"Error: {str(e)}")
        # Show the cleaned text when there is one; otherwise fall back to the raw
        # response so the failing input is visible instead of a bare "None".
        print(cleaned if cleaned is not None else raw_response)

Error: the JSON object must be str, bytes or bytearray, not NoneType
None
Error: Expecting ',' delimiter: line 1 column 317 (char 316)
[["Will we still be using the same NYC Trip data as last year?", "Are we switching to new data for the project?", "What data will we use for this year's project? Are we still using the 2021 data?", "Although the project remains the same, will the data be updated?", "What data is available for this year's project?"]
Error: Expecting ',' delimiter: line 1 column 51 (char 50)
[ "How can I participate in the course?", "answer": null,  "What is the way to improve the course content?", "answer": null,  "How can others benefit from the course?", "answer": null,  "Can I enhance the quality of the course repository?", "answer": null,  "What should I do if I find the course material useful?", "answer": null]
Error: Expecting value: line 1 column 274 (char 273)
["What resources do you recommend for this course?", "Can you provide any book recommendations?", "Do yo

In [129]:
len(parsed_results)

845

In [130]:
# Lookup table from document id to its record; when an id repeats, the last record wins.
doc_index = dict((d['id'], d) for d in documents)

In [131]:
doc_index["1f6520ca"]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [132]:
len(doc_index)

947

In [133]:
# One (question, course, document id) tuple per generated question, restricted
# to ids that are present in `doc_index`.
final_results = [
    (q, doc_index[doc_id]['course'], doc_id)
    for doc_id, questions in parsed_results.items()
    if doc_id in doc_index
    for q in questions
]

In [134]:
len(final_results)

4207

In [135]:
final_results[0]

('Can you tell me when the course is starting?',
 'data-engineering-zoomcamp',
 'c02e79ef')

### Generating pandas dataframe and saving to .csv file

In [137]:
import pandas as pd

In [138]:
# Tabulate the ground truth: one row per generated question.
df = pd.DataFrame(
    data=final_results,
    columns=['question', 'course', 'document'],
)

In [139]:
df.size

12621

In [140]:
df.shape

(4207, 3)

In [141]:
len(df)

4207

In [142]:
df.head(10)

Unnamed: 0,question,course,document
0,Can you tell me when the course is starting?,data-engineering-zoomcamp,c02e79ef
1,What is the exact day and hour of the course?,data-engineering-zoomcamp,c02e79ef
2,Is there a specific calendar I should subscrib...,data-engineering-zoomcamp,c02e79ef
3,Can you guide me on how to register for the co...,data-engineering-zoomcamp,c02e79ef
4,What platforms can I use to stay updated on co...,data-engineering-zoomcamp,c02e79ef
5,What are the prerequisites for the course?,data-engineering-zoomcamp,1f6520ca
6,What do I need to know before starting the cou...,data-engineering-zoomcamp,1f6520ca
7,How can I prepare for the course?,data-engineering-zoomcamp,1f6520ca
8,Do I need any prior knowledge to take this cou...,data-engineering-zoomcamp,1f6520ca
9,Are there any specific skills or qualification...,data-engineering-zoomcamp,1f6520ca


In [143]:
df.tail(10)

Unnamed: 0,question,course,document
4197,Why did the pre-commit command fail in the iso...,mlops-zoomcamp,fb3c4150
4198,What is the most recent working version of iso...,mlops-zoomcamp,fb3c4150
4199,What is the reason for the error in the pre-co...,mlops-zoomcamp,fb3c4150
4200,How did the solution help in resolving the iss...,mlops-zoomcamp,fb3c4150
4201,Who contributed to the solution of setting iso...,mlops-zoomcamp,fb3c4150
4202,What are some best practices for destroying in...,mlops-zoomcamp,886d1617
4203,How can I destroy infrastructure created in AW...,mlops-zoomcamp,886d1617
4204,Do I need to reconfigure something before dest...,mlops-zoomcamp,886d1617
4205,What command should I use from local to initia...,mlops-zoomcamp,886d1617
4206,What file should I use as a variable file when...,mlops-zoomcamp,886d1617


In [145]:
df.isnull().sum()

question    0
course      0
document    0
dtype: int64

In [146]:
# Rows of `df` that have a missing value in at least one column.
null_mask = df.isna().any(axis=1)
null_rows = df.loc[null_mask]
null_rows.shape

(0, 3)

In [147]:
# Rows of `df` in which every column is populated.
not_null_mask = df.notna().all(axis=1)
not_null_rows = df.loc[not_null_mask]
not_null_rows.shape

(4207, 3)

In [151]:
df.to_csv('ground-truth-data.csv', index=False)

In [152]:
!head -n 20 ground-truth-data.csv

question,course,document
Can you tell me when the course is starting?,data-engineering-zoomcamp,c02e79ef
What is the exact day and hour of the course?,data-engineering-zoomcamp,c02e79ef
Is there a specific calendar I should subscribe to for the course schedule?,data-engineering-zoomcamp,c02e79ef
Can you guide me on how to register for the course?,data-engineering-zoomcamp,c02e79ef
What platforms can I use to stay updated on course announcements?,data-engineering-zoomcamp,c02e79ef
What are the prerequisites for the course?,data-engineering-zoomcamp,1f6520ca
What do I need to know before starting the course?,data-engineering-zoomcamp,1f6520ca
How can I prepare for the course?,data-engineering-zoomcamp,1f6520ca
Do I need any prior knowledge to take this course?,data-engineering-zoomcamp,1f6520ca
Are there any specific skills or qualifications required for the course?,data-engineering-zoomcamp,1f6520ca
Can I still join the course after the start date and what are the implications on the as

In [153]:
!tail -n 20 ground-truth-data.csv

How do I manage multiple Docker containers with docker-compose when they take up too many resources?,mlops-zoomcamp,b16aae74
What happens when a docker-compose file contains a large number of containers?,mlops-zoomcamp,b16aae74
Can I only run a subset of containers from a large docker-compose file?,mlops-zoomcamp,b16aae74
How do I specify which group of containers to start when using docker-compose?,mlops-zoomcamp,b16aae74
What is the purpose of adding profiles in a service definition in docker-compose?,mlops-zoomcamp,b16aae74
Can I have more details on why AWS regions need to match in docker-compose and local config?,mlops-zoomcamp,66326a87
"What happens if my AWS regions don't match, and how can I fix it?",mlops-zoomcamp,66326a87
Can you provide an example of how to set my AWS region in the local config?,mlops-zoomcamp,66326a87
How does this issue with AWS regions affect my integration tests and kinesis?,mlops-zoomcamp,66326a87
Can I set the AWS region in the docker-compose file inst

In [155]:
# Read the saved ground-truth CSV back to check the round trip.
# NOTE(review): the length reported below (264445) is far larger than the 4207
# rows written from `df`; the extra rows appear to be rows with missing values
# (see the null check that follows). The cause is not visible here - TODO investigate.
df_ground_truth_from_csv = pd.read_csv('ground-truth-data.csv', skip_blank_lines=True)
len(df_ground_truth_from_csv)

264445

In [156]:
# Rows of the reloaded frame that have a missing value in at least one column.
null_mask = df_ground_truth_from_csv.isna().any(axis=1)
null_rows = df_ground_truth_from_csv.loc[null_mask]
null_rows.shape

(260238, 3)

In [160]:
# Discard the rows with missing values; the name is rebound to the cleaned frame.
df_ground_truth_from_csv = df_ground_truth_from_csv.dropna()
df_ground_truth_from_csv.shape

(4207, 3)