### Step 1: Ground-Truth Dataset Preparation

In [4]:
import json

In [5]:
# Load the raw FAQ export. As consumed further down, it is a list of course
# entries, each carrying a 'course' name and a 'documents' list.
# The encoding is pinned to UTF-8 because the records contain non-ASCII
# punctuation (curly quotes); without it, decoding would depend on the
# platform's locale default.
with open("../data/documents.json", "rt", encoding="utf-8") as file_in:
    raw_documents = json.load(file_in)

In [7]:
# Flatten the per-course structure into a single list of FAQ records, tagging
# each record with the course it belongs to. Every record is copied instead of
# being modified in place, so `raw_documents` stays exactly as loaded and
# later in-place additions to `documents` do not leak back into it.
documents = [
    {**doc, 'course': course['course']}
    for course in raw_documents
    for doc in course['documents']
]

# Peek at the first flattened record.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

### Step 2: Generate a unique ID for each document

In [10]:
import hashlib

In [11]:
def generate_document_id(doc):
    """Return a short, deterministic identifier for one FAQ record.

    The identifier is the first 8 hex characters of the MD5 digest of a string
    assembled from the record's 'course', 'question' and 'text' fields.

    NOTE(review): the trailing ``[:10]`` is outside the replacement field, so
    the complete text followed by the literal characters "[:10]" is hashed; a
    slice of the text was probably intended. Left as is because changing the
    hashed string changes every id, including ids already used as keys in
    saved results -- confirm before fixing.
    """
    key = "{course}-{question}{text}[:10]".format(
        course=doc['course'],
        question=doc['question'],
        text=doc['text'],
    )
    return hashlib.md5(key.encode()).hexdigest()[:8]

In [14]:
# Stamp every flattened FAQ record with its short hash identifier (in place).
for doc in documents:
    doc.update(id=generate_document_id(doc))

# Spot-check one record to confirm the new 'id' field is present.
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '162ef410'}

In [64]:
# Persist the id-stamped records as JSON so later steps can reuse the same ids.
with open("documents_id.json", "wt") as d_out:
    d_out.write(json.dumps(documents))

### Step 3: Generate user questions with ChatGPT

In [16]:

# Prompt sent to the chat model for each FAQ record. The {section}, {question}
# and {text} placeholders are filled with str.format from a record's fields,
# and the model is asked to answer with a JSON list of five questions.
# .strip() drops the leading/trailing newlines left by the triple-quoted layout.
# NOTE(review): "as fewer words as possible" looks like a typo for "as few
# words as possible"; left unchanged because the prompt text affects the
# model's output.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [17]:
from openai import OpenAI

# Credentials must never live in the notebook. With no explicit api_key the
# client reads the OPENAI_API_KEY environment variable and raises at
# construction time if it is missing, instead of failing later on the first
# request as an empty key would.
o_client = OpenAI()


In [22]:
# Render the prompt for one sample record to eyeball the final wording.
doc = documents[1]
prompt = prompt_template.format_map(doc)

print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - What are the prerequisites for this course?
answer: GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [23]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for student-style questions about one FAQ record.

    Args:
        doc: FAQ record; it must provide every field named in
            ``prompt_template`` ('section', 'question' and 'text'). Extra
            keys are ignored by the formatting step.
        model: Name of the chat-completion model to query. Defaults to
            'gpt-4o', the value that used to be hard-coded.

    Returns:
        The text content of the first completion choice. The prompt asks for
        a JSON list, but the reply is returned unparsed and unvalidated, so
        callers must be prepared for malformed JSON.
    """
    prompt = prompt_template.format(**doc)

    response = o_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [25]:
# Smoke-test the generator on a single record before the full (paid) run.
doc = documents[1]
raw_reply = generate_questions(doc)

print(raw_reply)

[
    "Where can I find the necessary background knowledge for this course?",
    "What prior experience is required to enroll in the course?",
    "What skills do I need before starting this course?",
    "Where are the details about the course prerequisites located?",
    "What foundational knowledge is recommended before taking this class?"
]


In [26]:
from tqdm.auto import tqdm

In [29]:
# Collect the model's raw reply for every record, keyed by document id.
# A record whose id has already been answered is skipped, so records sharing
# an id trigger only one API call.
results = {}

for doc in tqdm(documents):
    if doc['id'] not in results:
        results[doc['id']] = generate_questions(doc)


100%|██████████| 948/948 [32:00<00:00,  2.03s/it]  


In [30]:
import pickle


In [31]:
# Checkpoint the raw replies to disk so the long generation run need not be repeated.
with open('result.bin', mode='wb') as checkpoint:
    pickle.dump(results, checkpoint)

In [32]:
# Show only the first few entries; rendering the entire dict floods the
# notebook with hundreds of raw replies.
dict(list(results.items())[:3])

{'23cb47db': '[\n    "On what date and time does the course commence?",\n    "How can I subscribe to the course\'s public Google Calendar?",\n    "What should I do to participate in the course before it begins?",\n    "Where can I find announcements for the course?",\n    "How can I join the course discussions on Slack?"\n]',
 '8d36ba77': '[\n  "What should I complete before enrolling in this course?",\n  "Where can I find the necessary skills needed before starting the course?",\n  "What prior knowledge is required for this course?",\n  "Is there a list of prerequisites available for the course?",\n  "How can I check if I meet the course entry requirements?"\n]',
 'bc44dd09': '[\n    "Is it possible to enroll in the course after it begins?",\n    "Can I participate in the course without registering by the start date?",\n    "Am I eligible to submit homeworks if I join late?",\n    "Do I face deadlines if I join after the course starts?",\n    "Will I be able to submit my work even if 

In [33]:
# Reload the checkpoint written by this notebook. pickle.load must only be
# used on trusted files such as this self-produced one.
with open('result.bin', mode='rb') as checkpoint:
    loaded_result = pickle.load(checkpoint)

In [50]:
# Manually set the stored reply for document id "3bfef890" to a hand-written
# JSON list of five questions.
# NOTE(review): presumably the model's original reply for this id could not be
# parsed as JSON -- confirm. The hard-coded key only matches while
# generate_document_id keeps producing the same ids.
loaded_result["3bfef890"] = """[
    "What can cause an error when a machine learning model expects a numerical input but receives a string like 'Nissan'?",
    "How can I resolve the issue of a model trying to convert a string, such as a car brand, into a numerical value?",
    "What method can be used to encode categorical variables like car brands into numerical values for machine learning purposes?",
    "Could you provide an example of how to perform one-hot encoding on a DataFrame containing car brands using pandas?",
    "When using pandas, what function helps create binary columns for each category/label in a column with brand names?"
]"""

In [51]:
# Save the manually corrected replies under a new name, keeping the original
# checkpoint untouched.
with open('result_1.bin', mode='wb') as checkpoint:
    pickle.dump(loaded_result, checkpoint)

In [52]:
# Read the corrected checkpoint back to verify the round trip. pickle.load
# must only be used on trusted files such as this self-produced one.
with open('result_1.bin', mode='rb') as checkpoint:
    loaded_result_1 = pickle.load(checkpoint)

In [53]:
# Confirm the manually written reply is present in the reloaded checkpoint.
loaded_result_1['3bfef890']

'[\n    "What can cause an error when a machine learning model expects a numerical input but receives a string like \'Nissan\'?",\n    "How can I resolve the issue of a model trying to convert a string, such as a car brand, into a numerical value?",\n    "What method can be used to encode categorical variables like car brands into numerical values for machine learning purposes?",\n    "Could you provide an example of how to perform one-hot encoding on a DataFrame containing car brands using pandas?",\n    "When using pandas, what function helps create binary columns for each category/label in a column with brand names?"\n]'

In [55]:
# Decode each stored reply (a JSON string) into a Python object, keyed by
# document id.
parsed_results = {}

for doc_id, json_questions in tqdm(loaded_result_1.items()):
    try:
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as err:
        # Model output is not guaranteed to be valid JSON. Name the offending
        # document so the reply can be inspected or regenerated, instead of
        # surfacing a bare decode error with no hint of which entry failed.
        raise ValueError(
            f"Reply stored for document {doc_id!r} is not valid JSON: {err}"
        ) from err

100%|██████████| 948/948 [00:00<00:00, 265168.40it/s]


In [57]:
# Look-up table from document id to its full record. If two records share an
# id, the later one wins.
doc_index = dict(zip((record['id'] for record in documents), documents))

In [58]:
# Show only the first few entries; rendering the entire index floods the
# notebook with every FAQ record.
dict(list(doc_index.items())[:3])

{'23cb47db': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': '23cb47db'},
 '8d36ba77': {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '8d36ba77'},
 'bc44dd09': {'text': "Yes, even if you don't register, you're still eligibl

In [59]:
# Expand the per-document question lists into flat
# (question, course, document id) triples -- one row per generated question.
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((question, course, doc_id) for question in questions)

# Inspect one triple as a sanity check.
final_results[1]

("How can I subscribe to the course's public Google Calendar?",
 'data-engineering-zoomcamp',
 '23cb47db')

In [60]:
import pandas as pd

In [61]:
# Tabulate the triples and write them out as the ground-truth dataset
# (index column omitted from the CSV).
df = pd.DataFrame(
    data=final_results,
    columns=['question', 'course', 'document'],
)
df.to_csv('ground-truth-data.csv', index=False)

In [62]:
# Preview the first rows of the ground-truth table.
df.head()

Unnamed: 0,question,course,document
0,On what date and time does the course commence?,data-engineering-zoomcamp,23cb47db
1,How can I subscribe to the course's public Goo...,data-engineering-zoomcamp,23cb47db
2,What should I do to participate in the course ...,data-engineering-zoomcamp,23cb47db
3,Where can I find announcements for the course?,data-engineering-zoomcamp,23cb47db
4,How can I join the course discussions on Slack?,data-engineering-zoomcamp,23cb47db
