In [None]:
import os
import asyncio
import time
from openai import AsyncOpenAI, OpenAIError # Use AsyncOpenAI for asyncio
from dotenv import load_dotenv

# --- Configuration ---
# Read KEY=value pairs from a local .env file into the process environment,
# then fail fast if the API key is still missing.
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set.")

# The AsyncOpenAI client is not created at module level; batch_openai_pipeline
# builds one per batch and hands it to every process_single_prompt call.

# Request parameters shared by every chat-completion call in this notebook.
MODEL = "gpt-4.1-mini"  # model name passed as `model=`
MAX_TOKENS = 29000  # passed as `max_tokens=` (upper bound on completion length)
TEMPERATURE = 0.7  # passed as `temperature=`
MAX_CONCURRENT_REQUESTS = 500  # semaphore size; adjust to your rate limits

# Single module-level semaphore: every process_single_prompt call acquires it
# before talking to the API, so at most MAX_CONCURRENT_REQUESTS run at once.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

# System message sent with every request.
# NOTE(review): the text starts at "**Input:**" and continues with item "2.",
# so an introduction and the "1." marker appear to be missing - confirm this
# is the intended prompt. It is also sent for the problem-generation prompts,
# not only for the research-approach prompts it describes.
sys_prompt = """ **Input:** A clear description of the research problem or question.
2. **Output:** A well-structured research approach. This could include:
    - Hypothesis (if applicable)
    - Proposed methodology (e.g., survey, experiment, simulation, case study, theoretical analysis)
    - Key steps involved
    - Data collection plan (if applicable)
    - Data analysis plan (if applicable)
    - Potential challenges or limitations
    - Ethical considerations (if applicable)"""

# --- Single Prompt Processing Function ---
async def process_single_prompt(client: AsyncOpenAI, prompt: str, index: int) -> dict:
    """
    Send one prompt to the chat-completions API and report the outcome.

    The call is made while holding the module-level ``semaphore``, so the
    number of in-flight requests never exceeds its size. API failures are
    caught and reported in the returned dictionary instead of being raised,
    which lets a batch keep going when individual prompts fail.

    Args:
        client: Shared ``AsyncOpenAI`` client used to issue the request.
        prompt: Text sent as the user message (``sys_prompt`` is the system message).
        index: Zero-based position of the prompt in its batch; echoed back in
            the result and used (plus one) in progress messages.

    Returns:
        A dict with the keys ``index``, ``prompt``, ``response``, ``status``
        (``"success"`` or ``"error"``), ``error``, ``prompt_tokens`` and
        ``completion_tokens``. On error ``response`` is ``None`` and both
        token counts are ``-1``.
    """
    async with semaphore:  # Wait here while the concurrency limit is reached
        print(f"Processing prompt {index + 1}...")
        try:
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by system clock adjustments while the request is in flight.
            started = time.perf_counter()
            response = await client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                # You can add other parameters like top_p, presence_penalty etc.
            )
            elapsed = time.perf_counter() - started

            choice = response.choices[0]
            content = choice.message.content
            if content is None:
                # The message content is optional in the API response; report
                # that explicitly instead of failing on None.strip().
                reason = getattr(choice, "finish_reason", None)
                message = f"Model returned no text content (finish_reason={reason})"
                print(f"Error processing prompt {index + 1}: {message}")
                return _error_result(index, prompt, message)

            usage = response.usage
            print(f"Finished prompt {index + 1} in {elapsed:.2f}s ({usage.completion_tokens} tkns)")
            return {
                "index": index,
                "prompt": prompt,
                "response": content.strip(),
                "status": "success",
                "error": None,
                "prompt_tokens": usage.prompt_tokens,
                "completion_tokens": usage.completion_tokens
            }
        except OpenAIError as e:
            print(f"Error processing prompt {index + 1}: {e}")
            return _error_result(index, prompt, str(e))
        except Exception as e:
            # Task boundary: anything else is reported rather than propagated
            # so one bad prompt cannot abort the whole batch.
            print(f"Unexpected error processing prompt {index + 1}: {e}")
            return _error_result(index, prompt, f"Unexpected error: {str(e)}")


def _error_result(index: int, prompt: str, message: str) -> dict:
    """Build the result dictionary for a prompt that produced no usable response."""
    return {
        "index": index,
        "prompt": prompt,
        "response": None,
        "status": "error",
        "error": message,
        "prompt_tokens": -1,
        "completion_tokens": -1
    }

# --- Batch Pipeline Function ---
async def batch_openai_pipeline(prompts: list[str]) -> list[dict]:
    """
    Process a list of prompts concurrently using the OpenAI API.

    One ``AsyncOpenAI`` client is created for the batch, shared by every
    request, and closed again when the batch is done.

    Args:
        prompts: Prompt texts to send; each becomes one request.

    Returns:
        One result dict per prompt, in the same order as ``prompts``. Entries
        normally come from ``process_single_prompt``; if a task itself ended
        with an exception, the entry has ``status == "gather_error"`` and the
        same keys as a regular error entry.
    """
    print(f"Starting batch processing for {len(prompts)} prompts...")
    start_batch_time = time.perf_counter()

    # `async with` closes the client's underlying HTTP connections on exit,
    # even if gathering the results fails.
    async with AsyncOpenAI(api_key=API_KEY) as async_client:
        tasks = [
            process_single_prompt(async_client, prompt, i)
            for i, prompt in enumerate(prompts)
        ]
        # return_exceptions=True: a failing task is returned as an exception
        # object in its slot instead of aborting the other tasks.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    end_batch_time = time.perf_counter()
    print(f"Batch processing finished in {end_batch_time - start_batch_time:.2f}s")

    # asyncio.gather returns results in the order the awaitables were passed,
    # so position i always belongs to prompts[i] and no re-sorting is needed.
    final_results = []
    for i, result in enumerate(results):
        # BaseException also covers asyncio.CancelledError, which is not an
        # Exception subclass and would otherwise be appended as if it were a dict.
        if isinstance(result, BaseException):
            print(f"Task for prompt {i+1} failed with an exception: {result}")
            final_results.append({
                "index": i,
                "prompt": prompts[i],  # Get original prompt
                "response": None,
                "status": "gather_error",
                "error": f"Gather exception: {str(result)}",
                # Same keys as the dicts built by process_single_prompt, so
                # consumers can read token counts from every entry.
                "prompt_tokens": -1,
                "completion_tokens": -1
            })
        else:
            final_results.append(result)  # Dict returned by process_single_prompt

    return final_results

# --- Example Usage ---
# async def main():
#     # sample_prompts = [
#     #     "Explain the concept of asynchronous programming in Python.",
#     #     "Write a short poem about a rainy day.",
#     #     "What is the capital of France?",
#     #     "Summarize the main plot points of 'Hamlet'.",
#     #     "Translate 'Hello, world!' to Spanish.",
#     #     "What are the benefits of using a semaphore?",
#     #     "Give me three healthy breakfast ideas.",
#     #     "Explain the difference between GPT-3.5 and GPT-4.",
#     #     "Write a python function to calculate factorial.",
#     #     "Who painted the Mona Lisa?"
#     #     # Add more prompts as needed
#     # ]
#     sample_prompts = [
#         "Generate 5 different novel research problems in the field of psychology. Denote each question with <problem></problem> tags.",
#     ]

#     results = await batch_openai_pipeline(sample_prompts)

#     print("\n--- Batch Results ---")
#     for result in results:
#         print(f"\nPrompt {result['index'] + 1}: {result['prompt']}")
#         if result['status'] == 'success':
#             print(f"Response: {result['response']}")
#             print(result['completion_tokens'])
#         else:
#             print(f"Status: {result['status']}")
#             print(f"Error: {result['error']}")
#         print("-" * 20)

#     return results

# if __name__ == "__main__":
#     # To run the async main function
#     # asyncio.run(main())
#     x = await main()

#     print(x)

In [None]:
def generate_prompts(topics, num_problems=200):
    """Build one problem-generation prompt per topic.

    Args:
        topics: Iterable of field/discipline names.
        num_problems: How many research problems each prompt asks for.
            Defaults to 200.

    Returns:
        A list of prompt strings, one per topic and in the same order. Every
        prompt asks for the problems to be wrapped in <problem></problem> tags.
    """
    return [
        f"Generate {num_problems} different novel academic research problems in the field of {topic}. Denote each question with <problem></problem> tags."
        for topic in topics
    ]

async def main(prompts):
    """Run a batch of prompts, print a per-prompt report and return the results.

    Args:
        prompts: List of prompt strings handed to batch_openai_pipeline.

    Returns:
        The list of result dicts produced by batch_openai_pipeline, unchanged.
    """
    batch = await batch_openai_pipeline(prompts)

    print("\n--- Batch Results ---")
    divider = "-" * 20
    for entry in batch:
        print(f"\nPrompt {entry['index'] + 1}: {entry['prompt']}")
        if entry['status'] != 'success':
            # Failed request: show what went wrong instead of a response.
            print(f"Status: {entry['status']}")
            print(f"Error: {entry['error']}")
        else:
            print(f"Response: {entry['response']}")
            print(entry['completion_tokens'])
        print(divider)

    return batch

# if __name__ == "__main__":
#     # To run the async main function
#     # asyncio.run(main())
#     x = await main()

# Academic fields used as topics for problem generation. The list order
# matters: a later cell maps the i-th response back to disciplines[i].
disciplines = [
    # --- Natural Sciences ---
    "Physics",
    "Chemistry",
    "Biology",
    "Earth Sciences",  # (Geology, Oceanography, Meteorology)
    "Astronomy",
    "Environmental Science",
    "Materials Science",

    # --- Life Sciences / Medicine ---
    "Medicine",
    "Neuroscience",
    "Genetics",
    "Ecology",
    "Molecular Biology",
    "Biochemistry",
    "Pharmacology",
    "Public Health",
    "Nursing",
    "Biomedical Engineering", # Straddles Engineering/Medicine

    # --- Formal Sciences ---
    "Mathematics",
    "Statistics",
    "Computer Science",
    "Logic", # Often within Philosophy or Math
    "Data Science", # Interdisciplinary but distinct focus

    # --- Social Sciences ---
    "Psychology",
    "Sociology",
    "Anthropology",
    "Economics",
    "Political Science",
    "Geography", # (Human Geography focus)
    "Communication Studies",
    "Education",
    "Archaeology", # Straddles Humanities/Social Sciences

    # --- Humanities ---
    "History",
    "Philosophy",
    "Literature", # (English, Comparative, etc.)
    "Linguistics",
    "Religious Studies",
    "Classics", # (Study of ancient Greece and Rome)
    "Art History",

    # --- Engineering & Technology ---
    "Mechanical Engineering",
    "Electrical Engineering",
    "Civil Engineering",
    "Chemical Engineering",
    "Aerospace Engineering",
    "Industrial Engineering",
    "Software Engineering", # Often within CS but distinct focus
    "Robotics", # Interdisciplinary

    # --- Arts ---
    "Visual Arts", # (Painting, Sculpture, Photography etc.)
    "Music", # (Musicology, Composition, Performance Studies)
    "Theatre Studies",
    "Film Studies",
    "Architecture",
    "Design", # (Graphic Design, Industrial Design, Interaction Design)

    # --- Professional Fields ---
    "Law",
    "Business Administration", # (Management, Strategy)
    "Finance",
    "Marketing",
    "Accounting",
    "Urban Planning",
    "Library and Information Science",
    "Social Work",
    "Agricultural Science",
]

# One request per discipline, all issued concurrently through main().
# Top-level `await` only works where the interpreter allows it (this file is
# a notebook export, where IPython/Jupyter provides the running event loop).
responses = await main(generate_prompts(disciplines))

# Raw dump of every result dict (prompt, response text, status, token counts).
print(responses)

# Optional: Print the number of disciplines listed
# print(f"Generated a list of {len(disciplines)} disciplines.")

In [None]:
# Number of result entries from the last batch run (one per prompt sent).
print(len(responses))

In [None]:
import re

# Non-greedy + DOTALL: each match is the text of one <problem>...</problem>
# pair, and a problem may span several lines.
_PROBLEM_PATTERN = re.compile(r"<problem>(.*?)</problem>", re.DOTALL)

# Parallel lists: topic_list[i] is the discipline that research_question_list[i]
# was generated for.
research_question_list = []
topic_list = []
for idx, r in enumerate(responses):
    output = r['response']
    if not output:
        # Failed requests carry response=None; there is nothing to parse and
        # a substring test on None would raise TypeError.
        continue
    for raw_problem in _PROBLEM_PATTERN.findall(output):
        # If an earlier opening tag was never closed, keep only the text after
        # the last <problem> inside the match.
        problem = raw_problem.rpartition('<problem>')[2].strip()
        if not problem:
            # Skip empty tags instead of storing blank questions.
            continue
        research_question_list.append(problem)
        # responses are ordered like the prompts, i.e. like `disciplines`.
        topic_list.append(disciplines[idx])

# Problems are collected in the order they appear in each response.
question_data = {"prompt": topic_list, "completion": research_question_list}
print(question_data)


In [None]:
import pandas as pd

# Tabulate the parsed problems: the dict keys of question_data become the
# columns ("prompt" holds the discipline, "completion" the research question).
df = pd.DataFrame(question_data)

# Persist without the positional index so the file holds only the two columns.
df.to_csv('topic_question.csv', index=False)

df

In [None]:
responses = await main(df['prompt'].to_list())

print(responses)

In [None]:
approach_list = []
question_list = []
for idx, r in enumerate(responses):
    approach_list.append(r['response'])
    question_list.append(r['prompt'])
    # while "<problem>" in output:
    #     problem = output[output.rfind('<problem>')+9:output.rfind('</problem>')]
    #     # print(problem)
    #     output = output[:output.rfind('<problem>')]
    #     research_question_list.append(problem)
    #     topic_list.append(disciplines[idx])

approach_data = {"prompt": question_list, "completion": approach_list}

df_approach = pd.DataFrame.from_dict(approach_data)

df_combined = pd.merge(df, df_approach, on='question')

# df_approach

df_combined.to_csv('topic_question_approach.csv', index=False)

# Find duplicates in 'col1', keeping the first occurrence
duplicates_dropped = df_combined.drop_duplicates(subset=['prompt'])

# print("Duplicates (keep='first'):\n", duplicates)

# Filter the DataFrame to show duplicate rows

duplicates_dropped.to_csv('topic_question_approach.csv', index=False)
duplicates_dropped



In [None]:
from IPython.display import display, Markdown
import random

# NOTE(review): data_points, threshold and status are not read anywhere else
# in the visible notebook - they look like leftovers from a Markdown-display
# example; confirm before removing.
data_points = random.randint(50, 200)
threshold = 100
status = "**above**" if data_points > threshold else "*below*"

# Display the string as rendered Markdown
# display(Markdown(duplicates_dropped.iloc[0]['approach']))

# Reload the saved dataset from disk.
loaded_df = pd.read_csv('topic_question_approach.csv')

# Rows with a missing value in ANY column.
# NOTE(review): presumably meant to find rows whose approach is missing; a row
# with a missing 'prompt' would also be selected and sent as NaN - verify.
none_values = loaded_df[loaded_df.isnull().any(axis=1)]

# Re-send the 'prompt' value of every incomplete row through the pipeline.
responses = await main(none_values['prompt'].to_list())

In [None]:
len(responses)

print("Original DataFrame:")
print(df)

# Patch a copy so loaded_df itself stays as it was read from disk.
df_loop_method = loaded_df.copy()

# For each retried request, write its response into the 'completion' cell of
# every row whose 'prompt' equals the text that was sent.
for retried in responses:
    same_prompt = df_loop_method['prompt'] == retried['prompt']
    df_loop_method.loc[same_prompt, 'completion'] = retried['response']

print("\nDataFrame Updated using loop and .loc:")
df_loop_method

# Overwrite the saved dataset with the patched version.
df_loop_method.to_csv('topic_question_approach.csv', index=False)

# Keep at most the first 150 rows of each topic, preserving row order.
rows_by_topic = df_loop_method.groupby('topic', group_keys=False)
truncated_dataset = rows_by_topic.head(150)

truncated_dataset.to_csv('topic_question_approach_trunc.csv', index=False)

truncated_dataset
