In [20]:
# Import Libraries

import os
import json
import hashlib
from collections import defaultdict
import openai
from openai import OpenAI
from tqdm.auto import tqdm
import pandas as pd

In [2]:
# Load the source dataset (JSON Lines: one JSON object per line)

documents = []
# Explicit encoding so parsing does not depend on the platform default
# (assumes the file is UTF-8, the usual encoding for JSON text)
with open('../data/data.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would otherwise reject with a JSONDecodeError
        if line.strip():
            documents.append(json.loads(line))

In [3]:
# Peek at one parsed record to sanity-check the loaded structure
documents[5]

{'question': 'What is (are) Acquired Cystic Kidney Disease ?',
 'answer': "Acquired cystic kidney disease happens when a person's kidneys develop fluid-filled sacs, called cysts, over time. Acquired cystic kidney disease is not the same as polycystic kidney disease (PKD), another disease that causes the kidneys to develop multiple cysts.\n                \nAcquired cystic kidney disease occurs in children and adults who have\n                \n- chronic kidney disease (CKD)a condition that develops over many years and may lead to end-stage kidney disease, or ESRD. The kidneys of people with CKD gradually lose their ability to filter wastes, extra salt, and fluid from the blood properly.  - end-stage kidney diseasetotal and permanent kidney failure that requires a kidney transplant or blood-filtering treatments called dialysis.\n                \nThe cysts are more likely to develop in people who are on kidney dialysis. The chance of developing acquired cystic kidney disease increases w

In [4]:
def generate_document_id(doc):
    """Derive a short, deterministic ID from a document's content.

    The question, answer, source and the first 10 characters of the focus
    area are joined with '-' and hashed with MD5; the first 8 hex digits of
    the digest are returned.
    """
    fingerprint = "{}-{}-{}-{}".format(
        doc['question'],
        doc['answer'],
        doc['source'],
        doc['focus_area'][:10],
    )
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [5]:
# Attach a content-derived ID to every record (the dicts are updated in place)

for record in documents:
    record.update(id=generate_document_id(record))

# Display one record as a spot check
documents[5]

{'question': 'What is (are) Acquired Cystic Kidney Disease ?',
 'answer': "Acquired cystic kidney disease happens when a person's kidneys develop fluid-filled sacs, called cysts, over time. Acquired cystic kidney disease is not the same as polycystic kidney disease (PKD), another disease that causes the kidneys to develop multiple cysts.\n                \nAcquired cystic kidney disease occurs in children and adults who have\n                \n- chronic kidney disease (CKD)a condition that develops over many years and may lead to end-stage kidney disease, or ESRD. The kidneys of people with CKD gradually lose their ability to filter wastes, extra salt, and fluid from the blood properly.  - end-stage kidney diseasetotal and permanent kidney failure that requires a kidney transplant or blood-filtering treatments called dialysis.\n                \nThe cysts are more likely to develop in people who are on kidney dialysis. The chance of developing acquired cystic kidney disease increases w

In [6]:
# Bucket documents by ID so that ID collisions become visible
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

# Equal counts mean every document received a distinct ID
len(hashes), len(documents)

(292, 292)

In [7]:
# Save the ID-augmented documents as a single pretty-printed JSON array.
# Serialising first means a serialisation error cannot leave a truncated
# file behind: the output file is only opened once the full text exists.
serialized = json.dumps(documents, indent=2)
with open('../data/data-with-ids.json', 'wt', encoding='utf-8') as f_out:
    f_out.write(serialized)

In [8]:
# Prompt template used to generate ground-truth questions from one record.
# The {question}, {answer}, {source} and {focus_area} placeholders are filled
# in with str.format; the doubled braces around the JSON example are escapes
# that render as literal { } in the final prompt.
# NOTE(review): "as fewer words" looks like a typo for "as few words" — left
# untouched because this text is part of the prompt sent to the model.

prompt_template = """
You emulate a user of our health assistant application.
Formulate 5 questions this user might ask based on a disease.
Make the questions specific to this disease.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

question: {question}
answer: {answer}
source: {source}
focus_area: {focus_area}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [9]:
# Render the template for one sample record to try the prompt out
# (keys of the record that the template does not name are ignored by format)
prompt = prompt_template.format(**documents[5])

In [None]:
# Read the OpenAI API key from the environment (None when the variable is unset)
# NOTE(review): api_key is not referenced by any other cell visible here
api_key = os.environ.get("OPENAI_API_KEY")

In [11]:
# Initialize the OpenAI client; no credentials are passed explicitly
# (presumably the SDK reads OPENAI_API_KEY from the environment — confirm)
client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt through the module-level ``client``.

    Args:
        prompt: Text sent as the only user message of the conversation.
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value that was previously hard-coded.

    Returns:
        The content of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [13]:
# Try the prompt on the sample record and check that the reply parses as JSON
questions = llm(prompt)
json.loads(questions)

{'questions': ['What does acquired cystic kidney disease entail in terms of kidney function and the development of cysts?',
  'How does acquired cystic kidney disease differ from polycystic kidney disease?',
  'What are the primary causes that lead to the development of acquired cystic kidney disease?',
  'How does the duration of dialysis treatment influence the risk of developing acquired cystic kidney disease?',
  'What are the potential outcomes for individuals diagnosed with end-stage kidney disease in relation to acquired cystic kidney disease?']}

In [14]:
# Build the prompt for one document and ask the model for questions about it.

def generate_questions(doc):
    """Ask the model for user-style questions about one document.

    Args:
        doc: Mapping that provides the fields named in ``prompt_template``
            (question, answer, source, focus_area); any extra keys are
            ignored by ``str.format``.

    Returns:
        The model's reply as a raw string. The prompt requests JSON, but the
        reply is returned unparsed; decoding is left to the caller.
    """
    prompt = prompt_template.format(**doc)
    # Reuse the shared llm() helper rather than duplicating the API call
    return llm(prompt)

In [15]:
# Store for the generated questions, keyed by document ID
results = {}

In [16]:
# Generate questions for every document, keyed by document ID.
# `results` is initialised in the preceding cell and deliberately NOT
# re-created here: keeping what earlier runs already collected lets the
# membership check below skip finished documents, so this cell can be re-run
# after an API or JSON-parsing failure without repeating completed requests.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    # Raises json.JSONDecodeError if the model did not return valid JSON
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/292 [00:00<?, ?it/s]

In [17]:
# Flatten {doc_id: [question, ...]} into a list of (doc_id, question) pairs
final_results = [
    (doc_id, question)
    for doc_id, question_list in results.items()
    for question in question_list
]

In [18]:
# Spot-check one (doc_id, question) pair
final_results[5]

('c0c79b2c',
 'What are the main characteristics associated with 22q11.2 deletion syndrome?')

In [21]:
# Persist the (id, question) pairs as the retrieval ground-truth CSV

ground_truth_columns = ['id', 'question']
df_results = pd.DataFrame(final_results, columns=ground_truth_columns)
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [26]:
# Show the last lines of the saved CSV as a quick check of the export
!tail ../data/ground-truth-retrieval.csv

12e7a764,What specific mutations are responsible for the complete form of X-linked congenital stationary night blindness?
12e7a764,How do the NYX and CACNA1F proteins function within the retina?
12e7a764,"What roles do rods and cones play in vision, particularly in relation to X-linked congenital stationary night blindness?"
12e7a764,How does the transmission of visual signals get disrupted in patients with this condition?
12e7a764,What are the differences in the effects on rods and cones between the complete and incomplete forms of X-linked congenital stationary night blindness?
3dfff330,What is the prevalence of Zollinger-Ellison Syndrome in the general population?
3dfff330,Are there specific age groups or demographics that are more likely to develop Zollinger-Ellison Syndrome?
3dfff330,Does having a family history of MEN1 increase the risk of Zollinger-Ellison Syndrome?
3dfff330,Is Zollinger-Ellison Syndrome more common in men or women?
3dfff330,What is the estimated occurrence of Z