In [130]:
import requests 

# Download the course FAQ and flatten it into a single list of records,
# tagging every record with the name of the course it came from.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps a stalled connection from blocking the kernel indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # The record dict is extended in place with its course name.
        doc['course'] = course_name
        documents.append(doc)

In [131]:
import hashlib

def generate_document_id(doc):
    """Return a short, deterministic identifier for a FAQ record.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>". The answer text is not part of the key, so two
    records with the same course and question receive the same id.
    """
    # Alternative key that also mixes in the start of the answer text:
    # key = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    key = "{}-{}".format(doc['course'], doc['question'])
    return hashlib.md5(key.encode()).hexdigest()[:8]

In [132]:
# Attach the generated identifier to every record, in place.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [133]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '4e079edc'}

In [134]:
from collections import defaultdict

In [135]:
# Bucket the records by id so that ids shared by several records can be spotted.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [136]:
len(hashes), len(documents)

(944, 948)

In [137]:
# Report every id that is shared by more than one record.
colliding_ids = (doc_id for doc_id, group in hashes.items() if len(group) > 1)
for colliding_id in colliding_ids:
    print(colliding_id)


ca3dc12d
960fb254
67d2f21c
297f443c


In [138]:
import json

# Persist the id-tagged records as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [139]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "5f1b0231"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [140]:
# Prompt sent to the LLM for each FAQ record. The {section}, {question} and
# {text} placeholders are filled with str.format from a record dict; the model
# is asked to answer with a bare JSON array of five questions.
# Surrounding whitespace is removed by the trailing .strip().
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [141]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Run load_dotenv() first, then read the API key from the process
# environment and build the OpenAI client with it.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [142]:
# Render the prompt for one sample record so it can be inspected before calling the model.
doc = documents[2]
prompt = prompt_template.format_map(doc)

In [143]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [144]:
# Send the sample prompt as a single user message and keep the raw reply text.
sample_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model='gpt-4o', messages=sample_messages)

json_response = response.choices[0].message.content
json_response

'[\n  "Is it possible to enter the course after it starts?",\n  "Can I enroll after the course has begun and still do the assignments?",\n  "Am I allowed to submit homework if I\'m not registered by the start date?",\n  "Will I face any restrictions on final project deadlines if I join late?",\n  "Can I delay registering for the course and work on assignments?"\n]'

In [146]:
# OpenAI

def generate_questions(doc, model='gpt-4o'):
    """Ask an OpenAI chat model to write student questions for one FAQ record.

    Args:
        doc: record dict providing the 'section', 'question' and 'text'
            values that `prompt_template` interpolates.
        model: name of the chat model to use. Defaults to 'gpt-4o', the
            model this function previously hard-coded; the parameter mirrors
            the one accepted by `generate_questions_ollama`.

    Returns:
        The raw text of the first completion choice. It is not parsed here.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [158]:
import requests
import json
from tqdm.auto import tqdm

def generate_questions_ollama(doc, model="llama3.2",
                              url='http://localhost:11434/api/generate',
                              timeout=None):
    """Ask a model served over an Ollama-style HTTP endpoint for questions.

    Args:
        doc: record dict providing the 'section', 'question' and 'text'
            values that `prompt_template` interpolates.
        model: model name sent in the request payload.
        url: generate endpoint to POST to. Defaults to the local address
            that was previously hard-coded.
        timeout: value forwarded to `requests.post`. The default of None
            keeps the previous behaviour of waiting without a limit; pass a
            number of seconds to bound a stalled request.

    Returns:
        The 'response' field of the JSON reply on HTTP 200. For any other
        status the status code is printed and the string "[]" is returned,
        so the caller still receives something parsable.
    """
    prompt = prompt_template.format(**doc)

    payload = {
        'model': model,
        'prompt': prompt,
        'stream': False,
    }
    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()['response']

    print(f"Error: {response.status_code}")
    return "[]"

# Generate questions once per distinct id; records whose id was already
# handled are skipped.
results = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions_ollama(doc)

print(f"Generated questions for {len(results)} documents")

  0%|          | 0/948 [00:00<?, ?it/s]

Generated questions for 944 documents


In [160]:
import pickle

# Store the raw model replies on disk with pickle.
with open('results.bin', 'wb') as results_file:
    pickle.dump(results, results_file)

print("Results saved to results.bin")

Results saved to results.bin


In [149]:
# Generate 5 questions for each doc

# results = {}

# from tqdm.auto import tqdm
# for doc in tqdm(documents):
#     doc_id = doc['id']
#     if doc_id in results:
#         continue
#     questions = generate_questions(doc)
#     results[doc_id] = questions

In [None]:
# with open('results.bin', 'rb') as f_in:
#     results = pickle.load(f_in)

In [164]:
len(results)

944

In [152]:
# results['58c9f99f'] = '''[
# r"How can I resolve the Docker error 'invalid mode: \Program Files\Git\\var\lib\postgresql\data'?",
# "What should I do if I encounter an invalid mode error in Docker on Windows?",
# "What is the correct mounting path to use in Docker for PostgreSQL data on Windows?",
# "Can you provide an example of a correct Docker mounting path for PostgreSQL data?",
# r"How do I correct the mounting path error in Docker for \Program Files\Git\var\lib\postgresql\data'?"
# ]'''

In [177]:
import ast
import json
import re

parsed_results = {}

def _as_question_list(parsed):
    """Coerce an already-decoded value into a flat list of questions."""
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set, frozenset)):
        return list(parsed)
    if isinstance(parsed, dict):
        # Keep the values only. A value that is itself a list, as in
        # {"questions": [...]}, is flattened so callers never get a nested list.
        questions = []
        for value in parsed.values():
            if isinstance(value, (list, tuple)):
                questions.extend(value)
            else:
                questions.append(value)
        return questions
    return [str(parsed)]


def _extract_question_lines(text):
    """Heuristic fallback: collect each line that looks like a question."""
    questions = []
    for line in text.split('\n'):
        line = line.strip()
        if '?' in line and len(line) > 10:  # Likely a question
            # Drop surrounding quotes/whitespace and a leading "questionN:" label.
            cleaned = re.sub(r'^["\'\s]*', '', line)
            cleaned = re.sub(r'["\'\s]*$', '', cleaned)
            cleaned = re.sub(r'^question\d+:\s*', '', cleaned, flags=re.IGNORECASE)
            if cleaned:
                questions.append(cleaned)
    return questions


def clean_and_parse_questions(json_questions):
    """Turn a raw model reply into a list of questions.

    Strategies are tried in order, and the first one that yields something wins:

    1. Decode the reply as JSON. A list is returned as is, a dict
       contributes its values, any other value becomes a one-element list
       holding its string form.
    2. Regex for `"questionN": "..."` pairs.
    3. Regex for any double-quoted span containing a question mark.
    4. `ast.literal_eval`, with the result coerced the same way as in step 1.
    5. Every line longer than 10 characters that contains '?', stripped of
       surrounding quotes and of a leading "questionN:" label.

    If nothing matches, the reply itself is returned as a single-element list.

    Args:
        json_questions: the raw reply text.

    Returns:
        A list; it is never empty unless the reply decodes to an empty
        container.
    """
    try:
        return _as_question_list(json.loads(json_questions))
    except json.JSONDecodeError:
        pass

    # Extract questions manually - get the question text only.
    questions = re.findall(r'"question\d+":\s*"([^"]+)"', json_questions)
    if questions:
        return questions

    questions = re.findall(r'"([^"]*\?[^"]*)"', json_questions)
    if questions:
        return questions

    try:
        return _as_question_list(ast.literal_eval(json_questions))
    except Exception:
        # Not a Python literal either. `Exception` rather than a bare
        # except, so KeyboardInterrupt / SystemExit still propagate.
        pass

    questions = _extract_question_lines(json_questions)
    return questions if questions else [json_questions]

# Parse every raw model reply; a reply that raises is reported and left out.
for doc_id, json_questions in results.items():
    try:
        parsed_results[doc_id] = clean_and_parse_questions(json_questions)
    except Exception as e:
        print(f"Failed to parse doc_id {doc_id}: {e}")
        print(f"Content: {json_questions[:200]}...")

print(f"Successfully parsed {len(parsed_results)} documents")

# Peek at the first parsed entry as a sanity check.
if parsed_results:
    sample_doc_id = next(iter(parsed_results))
    sample_questions = parsed_results[sample_doc_id]
    print(f"Sample parsed result: {sample_questions}")
    print(f"Number of questions: {len(sample_questions)}")

Successfully parsed 944 documents
Sample parsed result: ['What is the expected start time of the course and how will it be communicated to students?', "How do I access the live 'Office Hours' session at the beginning of the course?", 'Can I attend the course from a mobile device or only from a desktop computer?', 'What is the deadline for registering before the course starts?', 'Are there any other platforms besides Google Calendar and Slack that students should be aware of to stay updated on the course?']
Number of questions: 5


In [178]:
# Look-up table from id to record. When several records share an id,
# the one appearing last in `documents` is the one kept.
doc_index = dict((d['id'], d) for d in documents)

In [179]:
# Flatten the parsed questions into (question, course, document id) rows.
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [180]:
final_results

[('What is the expected start time of the course and how will it be communicated to students?',
  'data-engineering-zoomcamp',
  '5f1b0231'),
 ("How do I access the live 'Office Hours' session at the beginning of the course?",
  'data-engineering-zoomcamp',
  '5f1b0231'),
 ('Can I attend the course from a mobile device or only from a desktop computer?',
  'data-engineering-zoomcamp',
  '5f1b0231'),
 ('What is the deadline for registering before the course starts?',
  'data-engineering-zoomcamp',
  '5f1b0231'),
 ('Are there any other platforms besides Google Calendar and Slack that students should be aware of to stay updated on the course?',
  'data-engineering-zoomcamp',
  '5f1b0231'),
 ('What are the specific requirements I need to fulfill before starting this course?',
  'data-engineering-zoomcamp',
  '58fa8869'),
 ("Can you provide more details on what is meant by 'GitHub' in the context of course prerequisites?",
  'data-engineering-zoomcamp',
  '58fa8869'),
 ('Are there any additi

In [183]:
import pandas as pd

# One row per generated question: the question text, its course, and the source document id.
ground_truth_columns = ['question', 'course', 'document']
df = pd.DataFrame(data=final_results, columns=ground_truth_columns)

In [184]:
# Write the ground-truth table to CSV; index=False leaves out the DataFrame's row index.
df.to_csv('ground-truth-data.csv', index=False)

In [185]:
!head ground-truth-data.csv

question,course,document
What is the expected start time of the course and how will it be communicated to students?,data-engineering-zoomcamp,5f1b0231
How do I access the live 'Office Hours' session at the beginning of the course?,data-engineering-zoomcamp,5f1b0231
Can I attend the course from a mobile device or only from a desktop computer?,data-engineering-zoomcamp,5f1b0231
What is the deadline for registering before the course starts?,data-engineering-zoomcamp,5f1b0231
Are there any other platforms besides Google Calendar and Slack that students should be aware of to stay updated on the course?,data-engineering-zoomcamp,5f1b0231
What are the specific requirements I need to fulfill before starting this course?,data-engineering-zoomcamp,58fa8869
Can you provide more details on what is meant by 'GitHub' in the context of course prerequisites?,data-engineering-zoomcamp,58fa8869
Are there any additional tools or software that I need to install before enrolling in the course?,data-enginee