# Generate Synthetic Dataset for Text Embedding Use Cases

Code authored by: Shaw Talebi <br>
Article link: https://towardsdatascience.com/text-embeddings-classification-and-semantic-search-8291746220be <br>
Video link: https://youtu.be/sNa_uiqSlJo

### imports

In [1]:
import os
import time

import openai
import pandas as pd

from sk import my_sk 

In [2]:
def wait_for_assistant(thread, run):
    """
        Function to periodically check run status of AI assistant and print run time.

        Polls the run until its status is 'completed', then prints the elapsed
        wall-clock time and returns the most recently retrieved run object.

        Args:
            thread: thread object the run belongs to (only its .id is read)
            run: run object to wait on (its .id and .status are read)

        Returns:
            the run object whose status is 'completed'

        Raises:
            RuntimeError: if the run reaches a status from which it can never
                complete ('failed', 'cancelled', 'expired' or 'incomplete')
    """

    # statuses that end a run without it ever becoming 'completed';
    # without this check the loop below would poll forever
    dead_end_statuses = ('failed', 'cancelled', 'expired', 'incomplete')

    # seconds to pause between status checks
    poll_interval = 0.25

    # wait for assistant to process prompt
    t0 = time.time()
    while run.status != 'completed':

        if run.status in dead_end_statuses:
            raise RuntimeError(
                "Assistant run ended with status '" + str(run.status) + "': "
                + str(getattr(run, 'last_error', None))
            )

        # pause before polling so no time is wasted sleeping after completion
        time.sleep(poll_interval)

        # retrieve status of run (this might take a few seconds or more)
        run = client.beta.threads.runs.retrieve(
          thread_id=thread.id,
          run_id=run.id
        )

    dt = time.time() - t0
    print("Elapsed time: " + str(dt) + " seconds")

    return run

### create resume generator assistant

In [3]:
# build the OpenAI API client, authenticating with the secret key imported from sk
client = openai.OpenAI(
    api_key=my_sk,
)

In [4]:
# define instruction string
# Built from adjacent string literals instead of backslash line continuations:
# a backslash followed by trailing whitespace is NOT a continuation and would
# leave a stray backslash and newline in the prompt sent to the assistant.
intstructions_string = (
    "ResumeGenerator is designed as an input-output system with minimal interaction. "
    "It focuses on creating fake resumes in a neutral and professional tone, covering specified sections: names, summary, professional experience, education, technical skills, certifications, awards, and honors. "
    "It creates fictional resumes based on the user's description. It never asks for more details and uses its best judgment to fill in any gaps in user requests. "
    "Providing straightforward, efficient service with little back-and-forth communication."
)

In [5]:
# register the ResumeGenerator assistant with the API, passing it the
# module-level instruction string and the model it should run on
assistant = client.beta.assistants.create(
    model="gpt-3.5-turbo",
    name="ResumeGenerator",
    instructions=intstructions_string,
)

### generate resumes

In [6]:
def generateResume(user_message):
    """
        Produce a fake resume from a short user description.

        Opens a new thread, posts user_message to it, runs the module-level
        assistant on the thread, waits for the run to finish, and returns the
        text value of the first content item of the first listed message.
    """

    # every request gets its own conversation thread
    conversation = client.beta.threads.create()
    conversation_id = conversation.id

    # post the user's description into the thread
    client.beta.threads.messages.create(
        thread_id=conversation_id,
        role="user",
        content=user_message,
    )

    # start the assistant on the thread, then block until the run is done
    pending_run = client.beta.threads.runs.create(
        thread_id=conversation_id,
        assistant_id=assistant.id,
    )
    wait_for_assistant(conversation, pending_run)

    # fetch the thread's messages; the first entry is presumably the
    # assistant's reply (assumes the API lists newest first -- TODO confirm)
    thread_messages = client.beta.threads.messages.list(thread_id=conversation_id)
    first_message = thread_messages.data[0]

    return first_message.content[0].text.value

In [7]:
# create fake resumes based on various data/AI roles
#
# For each dataset ("train", then "test") this generates the configured number
# of resumes per role description and writes them, together with a role label,
# to resumes/resumes_<dataset_name>.csv.

# define dataset names
dataset_name_list = ["train", "test"]

# define role descriptions to pass to ai assistant and number of resumes to generate for each
description_list = ["Data Scientist", "Data Engineer", "Machine Learning Engineer", "AI Consultant", "Data Entrepreneur", "Generate a random resume, you decide the roles and industry."]
count_list = [40,20,20,10,5,5]

# make sure the output folder exists up front, so a missing directory cannot
# discard the results after all the (slow, billed) API calls have been made
os.makedirs('resumes', exist_ok=True)

for dataset_name in dataset_name_list:
    # initialize dict to store resume and role data
    resume_dict = {'resume':[], 'role':[]}

    # the test set is smaller than the train set
    if dataset_name == "test":
        count_list = [20,10,10,5,3,2]

    for i, (description, count) in enumerate(zip(description_list, count_list)):
        # The last description is an open-ended prompt rather than a role name,
        # so it is stored under the label "Random". The label is kept separate
        # from `description` so the full prompt is still what gets sent to the
        # assistant on every iteration.
        if i == len(description_list) - 1:
            role = "Random"
        else:
            role = description

        for _ in range(count):
            resume_dict['resume'].append(generateResume(description))
            resume_dict['role'].append(role)

    # store resumes in dataframe
    df_resume = pd.DataFrame.from_dict(resume_dict)
    # save dataframe as csv
    df_resume.to_csv('resumes/resumes_'+dataset_name+'.csv', index=False)

Elapsed time: 4.384328842163086 seconds
Elapsed time: 11.78203272819519 seconds
Elapsed time: 7.122700929641724 seconds
Elapsed time: 8.246670007705688 seconds
Elapsed time: 7.86118483543396 seconds
Elapsed time: 6.5407631397247314 seconds
Elapsed time: 7.022247076034546 seconds
Elapsed time: 13.170545816421509 seconds
Elapsed time: 16.05043077468872 seconds
Elapsed time: 18.199455976486206 seconds
Elapsed time: 11.325892925262451 seconds
Elapsed time: 10.318438053131104 seconds
Elapsed time: 6.152220964431763 seconds
Elapsed time: 6.6268908977508545 seconds
Elapsed time: 4.661313056945801 seconds
Elapsed time: 13.27452802658081 seconds
Elapsed time: 15.360554933547974 seconds
Elapsed time: 8.352412939071655 seconds
Elapsed time: 9.832673788070679 seconds
Elapsed time: 13.09963583946228 seconds
Elapsed time: 8.201879024505615 seconds
Elapsed time: 5.202651023864746 seconds
Elapsed time: 7.206186056137085 seconds
Elapsed time: 11.50045919418335 seconds
Elapsed time: 6.8400719165802 seco