In [None]:
import openai
from openai import OpenAI
from getpass import getpass
import os



os.environ['OPENAI_API_KEY'] = getpass()
openai.api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI()

In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [3]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [4]:
documents[3]


{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

In [5]:
from collections import defaultdict

In [6]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [7]:
len(hashes), len(documents)

(947, 948)

In [8]:
hashes

defaultdict(list,
            {'c02e79ef': [{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
               'section': 'General course-related questions',
               'question': 'Course - When will the course start?',
               'course': 'data-engineering-zoomcamp',
               'id': 'c02e79ef'}],
             '1f6520ca': [{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
               'section': 'General course-related questions',
               'question': 'Course - What are the prerequisites for this course?',
               

In [9]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

593f7569 2


In [10]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [11]:
import json

In [12]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [13]:
!head documents-with-ids.json


[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [30]:
prompt_template = """
You emulate  a student who's taking our course. Formulate 5 questions this student might ask based on FAQ record. 
The record should contain the answer to the questions, and the question should be completes and not too short.
if Possible, use as fewer words as possible from the record.

The record:
section: {section}
question: {question}
answer:  {text}

Provide the outpout in parsable JSON without using code blocks:

["question1", "question2",...."question5"]

""".strip()

In [31]:
doc = documents[3]
prompt = prompt_template.format(**doc)

In [32]:
prompt

'You emulate  a student who\'s taking our course. Formulate 5 questions this student might ask based on FAQ record. \nThe record should contain the answer to the questions, and the question should be completes and not too short.\nif Possible, use as fewer words as possible from the record.\n\nThe record:\nsection: General course-related questions\nquestion: Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?\nanswer:  You don\'t need it. You\'re accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.\n\nProvide the outpout in parsable JSON without using code blocks:\n\n["question1", "question2",...."question5"]'

In [33]:
prompt = prompt_template.format(**doc)

response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

json_response = response.choices[0].message.content

In [34]:
json_response

'["When will I receive the confirmation email for the course?", "Do I need to wait for any confirmation email before starting?", "Is there a registered list I need to be on?", "Can I start submitting homework without registering?", "What is the purpose of registration?"]'

In [35]:
json.loads(json_response)

['When will I receive the confirmation email for the course?',
 'Do I need to wait for any confirmation email before starting?',
 'Is there a registered list I need to be on?',
 'Can I start submitting homework without registering?',
 'What is the purpose of registration?']

In [41]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content":prompt}]
    )
    json_response = response.choices[0].message.content
    return json_response

In [42]:
from tqdm.auto import tqdm

results = {}


In [43]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [44]:
results

{'c02e79ef': '["When will the course start?",\n"How do I subscribe to the course calendar?",\n"When should I register for the course?",\n"How can I join the course Telegram channel?",\n"Should I join the Slack channel for the course?"]',
 '1f6520ca': '["What prerequisites does this course have?", "Where can I find the prerequisites for this course?", "Do I need prior knowledge before starting this course?", "Is there a specific GitHub link for course prerequisites?", "Are prerequisites listed on the DataTalksClub GitHub page?"]',
 '7842b56a': '["Can I join the course after it starts?", "Is late registration allowed?", "Can I submit homework without registration?", "Are there deadlines for final projects?", "Can I leave everything until the last minute?"]',
 '0bbf41ec': '["Is it necessary to wait for a confirmation email after registering for the Data Engineering Bootcamp?",\n"Can I start learning and submitting homework without official registration?",\n"Do I need to verify if I\'m on 

In [45]:
results['1f6520ca']

'["What prerequisites does this course have?", "Where can I find the prerequisites for this course?", "Do I need prior knowledge before starting this course?", "Is there a specific GitHub link for course prerequisites?", "Are prerequisites listed on the DataTalksClub GitHub page?"]'

In [64]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

JSONDecodeError: Expecting ',' delimiter: line 3 column 68 (char 157)

In [70]:
import json

parsed_results = {}

for doc_id, json_questions in results.items():
    try:
        # Print the JSON string before attempting to parse it
        # print(f"Parsing JSON for doc_id {doc_id}: {json_questions}")
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as e:
        # Print the error and the problematic JSON string
        print(f"JSONDecodeError for doc_id {doc_id}: {e}")
        print(f"Problematic JSON string: {json_questions}")

JSONDecodeError for doc_id b36ea564: Expecting ',' delimiter: line 3 column 68 (char 157)
Problematic JSON string: [
"How can I resolve being stuck on the password prompt when using PGCLI with Postgres?",
"How do I fix the 'FATAL: password authentication failed for user "root"' error in PGCLI?",
"What should I do if my Bash prompt is stuck on the password command for Postgres in Windows?",
"Which alternative terminals can I use if PGCLI is stuck on the password prompt?",
"What steps should I take if I frequently encounter 'password authentication failed for user "root"' despite entering the correct password?" 
]
JSONDecodeError for doc_id c91ad8f2: Invalid \escape: line 2 column 106 (char 185)
Problematic JSON string: ["How do I fix the error column c.relhasoids does not exist when using PGCLI?",
"What's the solution if I encounter the error column c.relhasoids does not exist while using the command \d <database name>?",
"PGCLI error says c.relhasoids does not exist. What should I do?"

In [71]:
results['c91ad8f2']

'["How do I fix the error column c.relhasoids does not exist when using PGCLI?",\n"What\'s the solution if I encounter the error column c.relhasoids does not exist while using the command \\d <database name>?",\n"PGCLI error says c.relhasoids does not exist. What should I do?",\n"What\'s the troubleshooting step for the error column c.relhasoids does not exist in PGCLI?",\n"How can I resolve the c.relhasoids does not exist error in PGCLI?"]'

In [72]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

NameError: name 'doc_index' is not defined

In [67]:
import pandas as pd

In [68]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

NameError: name 'final_results' is not defined