In [12]:
from tqdm import tqdm
import os
import openai
import re
import random 
import numpy as np
import pandas as pd
import dotenv
# Pull variables from the local .env file; override=True lets the file's
# values replace anything already present in the process environment.
dotenv.load_dotenv(dotenv_path=".env", override=True)

# The OpenAI key is taken from the OPENAI_API_KEY environment variable
# (put your own key in .env rather than hard-coding it here).
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [13]:
# Define the weight matrix used to condition occupation and age group on gender.
# NOTE: despite its name this is a table of sampling weights, laid out as:
#   column 0 -> base weights (normalized and used as-is for males)
#   column 1 -> multipliers applied on top of column 0 for females
#   rows 0-1 -> occupations, rows 2-4 -> age groups (same order as the lists below)
corr_matrix = np.array([[0.4, 0.7],
                        [0.6, 0.3],
                        [0.3, 0.4],
                        [0.4, 0.2],
                        [0.3, 0.4]])

# Define the categories for each characteristic
genders = ['male', 'female']
occupations = ['nurse', 'factory worker']
age_groups = ['20-30', '30-40', '40-50']

# Define the basic statistic of gender, we want 50/50 male and female
gender_prob = 0.5

# Generate 100 personas
n_personas = 100
personas = []

# Seeded generator so that a fresh "Restart & Run All" reproduces the same personas
SEED = 42
rng = np.random.default_rng(SEED)


def _conditional_probs(rows, gender):
    """Return normalized sampling probabilities for one characteristic.

    Parameters
    ----------
    rows : slice
        Rows of ``corr_matrix`` that belong to the characteristic.
    gender : str
        'female' scales the base weights (column 0) by column 1; any other
        value uses the base weights unchanged.

    Returns
    -------
    numpy.ndarray
        Weights for the selected rows, rescaled to sum to 1.
    """
    # Slicing a NumPy array yields a view, so work on a copy: in-place math on
    # the view would rewrite corr_matrix and make the weights drift from one
    # persona to the next.
    weights = corr_matrix[rows, 0].copy()
    if gender == 'female':
        weights = weights * corr_matrix[rows, 1]
    return weights / weights.sum()


for _ in range(n_personas):
    # Sample the gender using gender_prob
    gender = str(rng.choice(genders, p=[gender_prob, 1 - gender_prob]))

    # Sample the occupation based on the gender
    occupation_idx = rng.choice(len(occupations), p=_conditional_probs(slice(0, 2), gender))
    occupation = occupations[occupation_idx]

    # Sample the age group based on the gender
    age_idx = rng.choice(len(age_groups), p=_conditional_probs(slice(2, 5), gender))
    age_group = age_groups[age_idx]

    # Add the persona to the list
    personas.append({'gender': gender, 'occupation': occupation, 'age_group': age_group})

# output to csv
df_personas = pd.DataFrame(personas)
df_personas.to_csv('personas.csv', index=False)

In [14]:
# Reload the personas from 'personas.csv' (the file written by the generation
# cell) so the following cells work from the on-disk data.
persona_data = pd.read_csv('personas.csv')
# Bare last expression: shows the frame as the cell's output.
persona_data

Unnamed: 0,gender,occupation,age_group
0,male,nurse,30-40
1,male,factory worker,40-50
2,female,factory worker,40-50
3,male,nurse,40-50
4,male,nurse,40-50
...,...,...,...
95,female,nurse,20-30
96,male,nurse,40-50
97,female,nurse,20-30
98,female,nurse,20-30


In [15]:
# Characteristics handed to the model: every column of the persona table
input_columns = list(persona_data.columns)

# Fields the model is asked to fill in, in the order they appear in the prompt
output_columns = ['name', 'gender', 'age', 'occupation',
                  'background', 'hobbies', 'likes', 'dislikes']

In [16]:
# Build one prompt per persona row: a fixed header, the known characteristics
# as "column: value" lines, then a "column: <column>" placeholder line for
# every field the model should fill in.
header = "Build a persona using the following characteristics:\n"
# The placeholder section is the same for every persona, so build it once
placeholder_lines = "".join(f"{column}: <{column}>\n" for column in output_columns)

prompts = []
for row_idx in range(len(persona_data)):
    persona = persona_data.iloc[row_idx]
    known_lines = "".join(f"{column}: {persona[column]}\n" for column in input_columns)
    prompts.append(header + known_lines + placeholder_lines)

In [17]:
# Print the first prompt to check how the template renders
print(prompts[0])

Build a persona using the following characteristics:
gender: male
occupation: nurse
age_group: 30-40
name: <name>
gender: <gender>
age: <age>
occupation: <occupation>
background: <background>
hobbies: <hobbies>
likes: <likes>
dislikes: <dislikes>



In [18]:
# Query the OpenAI chat API with the first 5 prompts, one request per prompt,
# and keep each reply's text (whitespace-trimmed) in `outputs`.
outputs = []
messages = []
for prompt in tqdm(prompts[:5]):
    # Each request is a fresh single-turn conversation holding only this prompt
    user_turn = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(messages=user_turn, model="gpt-3.5-turbo")
    text = response.choices[0].message.content.strip()
    outputs.append(text)


100%|██████████| 5/5 [00:35<00:00,  7.18s/it]


In [19]:
# Parse the output into a pandas dataframe
def parse_persona_text(text):
    """Turn one model reply into a ``{field: value}`` dict.

    Every line shaped like ``"<field>: <value>"`` contributes one entry, with
    surrounding whitespace stripped from both parts. Lines that do not contain
    the ``": "`` separator (blank lines, headings, wrapped text) are skipped.

    Only the first ``": "`` on a line is treated as the separator, so a value
    that itself contains ``": "`` is kept whole instead of being cut short.
    """
    row = {}
    for line in text.split('\n'):
        key, separator, value = line.partition(": ")
        if not separator:
            # Not a "field: value" line; nothing to record
            continue
        row[key.strip()] = value.strip()
    return row


data = [parse_persona_text(text) for text in outputs]

df_persona = pd.DataFrame(data)
df_persona


Unnamed: 0,Name,Gender,Age,Occupation,Background,Hobbies,Likes,Dislikes
0,Ethan,Male,35,Nurse,Ethan grew up in a small town in the Midwest o...,Ethan enjoys running and hiking in his free ti...,Ethan takes pride in his work as a nurse and f...,Ethan is not a fan of people who are inconside...
1,John,Male,45,Factory Worker,John comes from a blue-collar family and has b...,"John enjoys spending time outdoors, fishing, a...",John is a family-oriented person and values sp...,"John dislikes injustice, dishonesty, and disre..."
2,Maria Rodriguez,Female,45,Factory Worker,Maria grew up in a working-class family in a s...,"Maria loves to read, especially romance novels...",Maria is a hard worker and takes pride in doin...,Maria is not a fan of change and can be resist...
3,David,Male,45,Nurse,David has been a nurse for 20 years and has wo...,"David enjoys going for long walks in nature, t...",He likes being able to make a positive impact ...,David dislikes seeing patients suffer due to i...
4,David Johnson,Male,45,Nurse,"David grew up in a small town in the Midwest, ...","David enjoys spending time outdoors, hiking, a...",David is passionate about improving the health...,David is frustrated by the current state of th...


In [20]:
# Write the parsed personas to 'output.csv'; index=False leaves the
# DataFrame's row index out of the file.
df_persona.to_csv('output.csv', index=False)