In [3]:
import enum
from pathlib import Path
from typing import Optional, List

import instructor
from dotenv import load_dotenv
from openai import AsyncOpenAI
from pydantic import BaseModel, Field

# Load variables from a local .env file before the client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

# Async OpenAI client wrapped with instructor.apatch; the completion call in
# generate_fake_profile_by_news_group passes `response_model` and `max_retries`
# through this wrapped client.
aclient = instructor.apatch(AsyncOpenAI())

In [4]:
import json

# Education level recorded on a FakeProfile (see the `education` field).
# Mixes in `str`, so every member is also an instance of `str`.
# NOTE(review): documented with `#` comments rather than a docstring because a
# class docstring may be copied into the JSON schema derived from these
# models — confirm before adding one.
class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"

# Place attached to a FakeProfile via its `location` field.
# All three fields are required strings.
# NOTE(review): kept docstring-free on purpose; a class docstring may end up in
# the JSON schema generated for the response model — confirm before adding one.
class Location(BaseModel):
    city: str
    state_or_province: str
    country: str

# A single generated person. Every field is required except `major`, which
# defaults to None.
# NOTE(review): class-level documentation is kept in `#` comments; a class
# docstring may be copied into the JSON schema built from this model — confirm.
class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)
    location: Location

    @classmethod
    def from_json(cls, data: str):
        """Build an instance from a JSON string.

        The text is decoded with ``json.loads`` and the decoded value is
        unpacked into the constructor as keyword arguments, so ``data`` must
        encode a JSON object.
        """
        decoded = json.loads(data)
        return cls(**decoded)


# Container for a batch of FakeProfile objects; this is the model passed as
# `response_model` to the completion call.
# NOTE(review): class-level documentation is kept in `#` comments; a class
# docstring may be copied into the schema sent with the request — confirm.
class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        """Build an instance from a JSON string.

        The text is decoded with ``json.loads`` and the decoded value is
        unpacked into the constructor as keyword arguments, so ``data`` must
        encode a JSON object.
        """
        decoded = json.loads(data)
        return cls(**decoded)

In [5]:
# System message for the profile-generation request. The newsgroup category
# name is supplied separately as the user message; the prompt asks for 20
# profiles per category and warns against over-fitting them to the category.
SYSTEM_PROMPT_GENERATE = """
You are a helpful AI assistant that generates mock data for different profiles. Given a 20newsgroup category, generate 20 fake profiles for people who might be interested in news from that category.
For example: 
Input: comp.sys.ibm.pc.hardware
Output: 20 fake profiles for people interested in computers and hardware where news in this category would be relevant to them.

Do not hyper tailor the profiles to the category. For example, if the category is about computers, do not generate profiles for people who are only interested in computers. Instead, generate profiles for people who are interested in computers and other things. The data should be realistic and varied.
"""

In [11]:
async def generate_fake_profile_by_news_group(news_group: str) -> FakeProfiles:
    """Request a batch of fake profiles for one newsgroup category.

    Args:
        news_group: Category name (e.g. ``"sci.space"``); sent unchanged as
            the user message, after the ``SYSTEM_PROMPT_GENERATE`` system
            message.

    Returns:
        The result of the completion call made with
        ``response_model=FakeProfiles`` — the whole batch, not a single
        ``FakeProfile`` (the previous annotation was wrong).
    """
    profiles = await aclient.chat.completions.create(
        model="gpt-4-1106-preview",
        response_model=FakeProfiles,
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT_GENERATE
            },
            {
                "role": "user",
                "content": news_group
            }
        ],
        # Retry budget handed to the instructor-patched client.
        max_retries=3
    )

    return profiles

In [7]:
async def generate_and_save_profiles(news_group: str):
    """Generate profiles for ``news_group`` and save them as JSON.

    The output file is ``../fake_profiles/<name>.json`` relative to the
    current working directory, where ``<name>`` is ``news_group`` with every
    ``.`` replaced by ``_``. An existing file is overwritten.

    Args:
        news_group: Category name passed on to
            ``generate_fake_profile_by_news_group``.
    """
    output_dir = Path("../fake_profiles")
    # Create the folder before the model call: previously a missing directory
    # only surfaced as FileNotFoundError after the request had completed,
    # throwing the generated result away.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{news_group.replace('.', '_')}.json"

    profiles = await generate_fake_profile_by_news_group(news_group)

    # Explicit UTF-8 so the file does not depend on the platform's default
    # text encoding.
    output_path.write_text(profiles.model_dump_json(), encoding="utf-8")

In [10]:
# Newsgroup category names to generate profiles for. Each entry is passed to
# generate_and_save_profiles, which uses it as the prompt input and (with '.'
# replaced by '_') as the output file name. Processed in list order.
NEWSGROUPS = [
    'alt.atheism',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'sci.med',
    'rec.sport.hockey',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns'
]

In [12]:
# Generate and save profiles for every category, one at a time: each call is
# awaited to completion before the next one starts.
# NOTE(review): the top-level `await` presumably relies on the notebook
# kernel's running event loop — confirm; it would not work in a plain script.
for news_group in NEWSGROUPS:
    await generate_and_save_profiles(news_group)