In [1]:
from openai import OpenAI
from prompts import *
import json

In [None]:
# Retrieve API key
with open("../api_keys.json", "r") as file:
    api_keys = json.load(file)

In [4]:
api_key = api_keys.get("Llama")

client = OpenAI(
    api_key = api_key,
    base_url = "https://api.llama-api.com"
)

In [10]:
print(advanced_developer_message)


You are generating synthetic healthcare data for a research project aiming to evaluate the realism and utility of synthetic datasets. The data must adhere to the following schema and meet the highest standards of quality, diversity, and logical coherence:

Schema:
- Name: string (e.g., "Alex Smith"; ensure diversity across names)
- Age: integer (0-100; follow a realistic demographic distribution, e.g., more adults than children)
- Gender: categorical (values: "male", "female", "other"; reflect real-world proportions)
- Blood Type: categorical (values: "A", "B", "AB", "O"; realistic distribution based on population statistics)
- Medical Condition: string (e.g., "diabetes", "flu", "hypertension"; include a mix of common and rare conditions)
- Date of Admission: string (format: "YYYY-MM-DD"; ensure temporal coherence)
- Doctor: string (e.g., "Dr. Patel"; match specialization to Medical Condition)
- Hospital: string (e.g., "Metropolitan Health Center"; maintain logical variety in hospital

In [13]:
print(basic_prompts[0])

Generate 10 rows of synthetic healthcare data.


In [19]:

full_data = []
for seed in range(100):
    response = client.chat.completions.create(
        model="llama3.1-70b",
        store=True,
        messages=[
                {
                    "role": "developer",
                    "content": advanced_developer_message
                },
                {
                    "role": "user",
                    "content": basic_prompts[0]
                }
            ],
        seed=seed
        
    )
    full_data.append(response)


# print(response.model_dump_json(indent=2))
# print('-----------')
# print(response.choices[0].message.content)

In [21]:
import os

# Define the folder and file paths
folder_path = "../full_data"
file_name = "llama_advanced_dev_message_w_basic_prompt_records.txt"
file_path = os.path.join(folder_path, file_name)

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Write to the file
with open(file_path, "w") as file:
    for item in full_data:
        file.write(f"{item}\n")

print(f"File created at: {file_path}")

File created at: ../full_data/llama_advanced_dev_message_w_basic_prompt_records.txt


In [20]:
len(full_data)

100

In [24]:
full_data[0]

ChatCompletion(id=None, choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 10 rows of synthetic healthcare data that adhere to the provided schema and meet the requirements for diversity, realism, and logical coherence:\n\n```\n[\n  {\n    "Name": "Emily Chen",\n    "Age": 32,\n    "Gender": "female",\n    "Blood Type": "A",\n    "Medical Condition": "hypertension",\n    "Date of Admission": "2022-01-15",\n    "Doctor": Dr. Lee (Cardiology),\n    Hospital', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1736266934, model='llama3.1-70b', object=None, service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=142, prompt_tokens=64, total_tokens=206, completion_tokens_details=None, prompt_tokens_details=None))

In [22]:
type(full_data[0])

openai.types.chat.chat_completion.ChatCompletion

In [25]:
def chatcompletion_to_dict(chat_completion):
    """Recursively convert a ChatCompletion object into a JSON-serializable dictionary."""
    if isinstance(chat_completion, list):
        return [chatcompletion_to_dict(item) for item in chat_completion]
    elif hasattr(chat_completion, "__dict__"):  # For objects with attributes
        return {
            key: chatcompletion_to_dict(value)
            for key, value in vars(chat_completion).items()
            if value is not None  # Skip None values
        }
    elif isinstance(chat_completion, dict):  # Handle nested dictionaries
        return {key: chatcompletion_to_dict(value) for key, value in chat_completion.items()}
    else:
        return chat_completion  # Return base types as-is
    
test = chatcompletion_to_dict(full_data[0])
test

{'choices': [{'finish_reason': 'length',
   'index': 0,
   'message': {'content': 'Here are 10 rows of synthetic healthcare data that adhere to the provided schema and meet the requirements for diversity, realism, and logical coherence:\n\n```\n[\n  {\n    "Name": "Emily Chen",\n    "Age": 32,\n    "Gender": "female",\n    "Blood Type": "A",\n    "Medical Condition": "hypertension",\n    "Date of Admission": "2022-01-15",\n    "Doctor": Dr. Lee (Cardiology),\n    Hospital',
    'role': 'assistant'}}],
 'created': 1736266934,
 'model': 'llama3.1-70b',
 'usage': {'completion_tokens': 142, 'prompt_tokens': 64, 'total_tokens': 206}}

In [26]:
full_data_serialized = []
for i in full_data:
    full_data_serialized.append(chatcompletion_to_dict(i))

In [27]:
import os

# Define the folder and file paths
folder_path = "../full_data"
file_name = "llama_advanced_dev_message_w_basic_prompt_records.json"
file_path = os.path.join(folder_path, file_name)

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Write to the file
with open(file_path, "w") as file:
    json.dump(full_data_serialized, file)

print(f"File created at: {file_path}")

File created at: ../full_data/llama_advanced_dev_message_w_basic_prompt_records.json


In [36]:
response = client.chat.completions.create(
        model="llama3.1-405b",
        store=True,
        messages=[
                {
                    "role": "developer",
                    "content": advanced_developer_message
                },
                {
                    "role": "user",
                    "content": basic_prompts[0]
                }
            ],
        stop=None
    )
response

ChatCompletion(id=None, choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are ten rows of synthetic healthcare data, each row representing a patient\'s information in JSON-like format:\n\n```\n[\n  {\n    "Name": "Evelyn Thompson",\n    "Age": 62,\n    "Gender": "female",\n    "Blood Type": "O",\n    "Medical Condition": "hypertension",\n    "Date of Admission": "2022-01-10",\n    "Doctor": "Dr. Rodriguez (Cardiologist)",\n    "Hospital": "', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1736275582, model='llama3.1-405b', object=None, service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=142, prompt_tokens=64, total_tokens=206, completion_tokens_details=None, prompt_tokens_details=None))

In [37]:
response.choices[0].message.content

'Here are ten rows of synthetic healthcare data, each row representing a patient\'s information in JSON-like format:\n\n```\n[\n  {\n    "Name": "Evelyn Thompson",\n    "Age": 62,\n    "Gender": "female",\n    "Blood Type": "O",\n    "Medical Condition": "hypertension",\n    "Date of Admission": "2022-01-10",\n    "Doctor": "Dr. Rodriguez (Cardiologist)",\n    "Hospital": "'