# Import libraries

`openai.ChatCompletion` is no longer supported in openai>=1.0.0, so install an earlier version (0.27.8 is pinned below). Also install cohere and tiktoken to resolve the following error message, which started appearing recently:
<blockquote>ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.<br>
llmx 0.0.15a0 requires cohere, which is not installed.<br>
llmx 0.0.15a0 requires tiktoken, which is not installed.</blockquote>

In [None]:
!pip install openai==0.27.8 cohere tiktoken
!pip install python-dotenv

In [None]:
import inspect
import os
import dotenv
import openai
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

# Mount drive

In [None]:
# Mount Google Drive at /content/drive; the dataset, .env, and output paths
# used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Create directories to store inferences

In [None]:
# One output folder per model. exist_ok=True lets this cell be re-run
# without failing when the folders are already present.
for _inference_dir in (
        '/content/drive/MyDrive/datasets/AWA_q1d_gpt-3.5-turbo-0301',
        '/content/drive/MyDrive/datasets/AWA_q1d_gpt-3.5-turbo-0613'):
    os.makedirs(_inference_dir, exist_ok=True)

# Authentication

First, save your OpenAI API key in a `.env` file at the top level of your Google Drive (`MyDrive/.env`), in the format:
OPENAI_API_KEY = '...'.

In [None]:
# Load the variables stored in the .env file on Google Drive into the
# process environment, then hand the API key to the openai module.
# Approach for Google Colab via https://stackoverflow.com/a/77166086
dotenv.load_dotenv('/content/drive/MyDrive/.env')
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Import dataset

In [None]:
# Read the tweets from Google Drive and replace whatever index the CSV
# load produced with a fresh default one (the old index is dropped).
largeScale_tweets = (
    pd.read_csv('/content/drive/MyDrive/datasets/largeScale_tweets.csv')
    .reset_index(drop=True)
)

# Set prompts

In [None]:
def system_content(persona):
    """Build the system message that asks the model to answer as `persona`.

    Args:
        persona: Description of the persona to simulate. It is inserted
            into the prompt twice, exactly as given.

    Returns:
        The two-line system prompt as a string.
    """
    prompt = f"""From now on, act as {persona}.
        Provide an answer that {persona} would regarding the post."""
    # cleandoc removes the indentation the second line carries only because
    # the literal is written inside this function body.
    return inspect.cleandoc(prompt)

In [None]:
def user_content(post):
    """Build the user message asking for a 1-5 racism rating of `post`.

    The fixed prompt template is dedented *before* the post is inserted.
    Inserting first would let the post's own text take part in the
    indentation clean-up: a post containing a line break would leave the
    rest of the prompt indented by four spaces, and tabs inside the post
    would be expanded to spaces. Dedenting first keeps the instructions
    identical for every post and passes the post through unchanged.

    Args:
        post: The post to rate. Non-string values are converted with the
            same formatting an f-string would apply.

    Returns:
        The complete user prompt: the post wrapped in triple double quotes,
        followed by the question, the rating scale, and the output
        constraint.
    """
    template = inspect.cleandoc("""Post: \"\"\"
    {post}
    \"\"\"

    In your opinion, how racist is this post?

    Please tell us your answer using this scale from 1 (Not at all) to 5 (Very
    much so).

    Constraint: Even if you are uncertain, you must pick either “1”, “2”, “3”,
    “4”, or “5” without using any other words.""")
    # str.format only substitutes the {post} placeholder; braces that occur
    # inside the post itself are not interpreted.
    return template.format(post=post)

# Define a function to call the Chat Completion API with (persona) or without (no-persona default) the *system message*

In [None]:
# Prevent rate limit errors. via
# https://github.com/openai/openai-cookbook/blob/main/examples/
# How_to_handle_rate_limits.ipynb
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def inference_with_backoff(user_cont, model, system_cont=None):
    """Send one chat completion request and return the reply text.

    The `retry` decorator re-runs the call with a randomized exponential
    wait (min=1, max=60) and stops after six attempts.

    Args:
        user_cont: Content of the user message.
        model: Name of the chat model to query.
        system_cont: Content of the system message. When None (the
            no-persona default), no system message is sent.

    Returns:
        The message content of the first choice in the response.
    """
    chat = [{"role": "user", "content": user_cont}]
    if system_cont is not None:
        # The system message, when present, goes before the user message.
        chat.insert(0, {"role": "system", "content": system_cont})
    # temperature=0 is passed on every request.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=chat,
        temperature=0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Define a function to perform model inference on **AWA**

Create a function `predict_q1d_ratings` that takes a list of personas (e.g., ['a White/Caucasian person', 'a Black/African American']), a `topic` (e.g., a sociodemographic attribute such as *Race/Ethnicity*), and a GPT model name (e.g., gpt-3.5-turbo-0301) as required input arguments. The optional argument `base` determines whether ratings made by the no-persona default should be included (pass `base='Yes'` to include them). For each persona in the list of personas, the function appends ' in the United States' to the persona and iterates over each post in **AWA**, calling the function `inference_with_backoff` for each post. It then adds the list of ratings predicted by each simulated persona as a new column to the **AWA** dataframe. The dataframe is saved in TSV file format, and the function returns the dataframe.

In [None]:
def predict_q1d_ratings(persona_list, topic, model, base=None):
    """Collect the model's ratings for every post and save them as a TSV.

    Works on a deep copy of the module-level `largeScale_tweets` dataframe
    and reads the posts from its 'tweet' column. One column of ratings is
    added per persona (plus an optional no-persona column), and the result
    is written to
    /content/drive/MyDrive/datasets/AWA_q1d_{model}/
    AWA_q1d_predictions_{topic}_{model}.tsv.

    The output directory is created before any request is sent, so an
    unwritable location fails immediately instead of after all requests
    have been made. The file is rewritten after each finished column, so
    if a later request fails the columns completed so far are already on
    disk (the file is then incomplete; the exception still propagates).

    Args:
        persona_list: Persona descriptions. ' in the United States' is
            appended to each one before it is used in the system prompt
            and in the column name (spaces in the column name are replaced
            with underscores).
        topic: Label used in the output file name.
        model: Model name passed to `inference_with_backoff`; also part of
            the output directory and file name.
        base: Pass the string 'Yes' to add a 'predicted_q1d_ratings_base'
            column obtained without a system message. Any other value
            skips it.

    Returns:
        The dataframe copy including the added rating columns.
    """
    out_dir = f'/content/drive/MyDrive/datasets/AWA_q1d_{model}'
    out_path = f'{out_dir}/AWA_q1d_predictions_{topic}_{model}.tsv'
    os.makedirs(out_dir, exist_ok=True)

    df = largeScale_tweets.copy(deep=True)
    # The user prompt depends only on the post, so build each one once and
    # reuse it for the baseline and for every persona.
    user_prompts = [user_content(post) for post in df['tweet'].values]

    def save_progress():
        df.to_csv(out_path, sep='\t', index=False)

    if base == 'Yes':
        df['predicted_q1d_ratings_base'] = [
            inference_with_backoff(prompt, model) for prompt in user_prompts]
        save_progress()
    for persona in persona_list:
        persona += ' in the United States'
        system_prompt = system_content(persona)
        df[f'predicted_q1d_ratings_{persona}'.replace(' ', '_')] = [
            inference_with_backoff(prompt, model, system_prompt)
            for prompt in user_prompts]
        save_progress()
    # Always write the file, even when no rating column was requested.
    save_progress()
    return df

# Create dictionaries, each with a `topic` as the key and a list of personas as the value

In [None]:
# Race/ethnicity personas, stored under the topic label as the only key.
race_ethnicity = dict(
    race_ethnicity=[
        'an Asian/Asian American',
        'a Black/African American',
        'a Hispanic/Latinx person',
        'a White/Caucasian person',
        'a Native Hawaiian/Pacific Islander',
        'a Native American/First Nations person',
    ],
)

In [None]:
# Gender personas, stored under the topic label as the only key.
gender = dict(
    gender=[
        'a man/male',
        'a woman/female',
        'a non-binary person',
    ],
)

In [None]:
# Age-range personas, stored under the topic label as the only key. Every
# persona has the form 'a person (aged <bracket>)'.
age_range = {
    'age_range': [
        f'a person (aged {bracket})'
        for bracket in ('18-24', '25-29', '30-34', '35-39', '40-44',
                        '45-49', '50-54', '55-59', '60-64', '65+')
    ],
}

# Call the `predict_q1d_ratings` function

Running each of the following cells could take several hours. Colab Pro+ includes continuous code execution (capped at 24 hours) and background execution capabilities, enabling you to close your browser/device while your code runs. CPU is sufficient. Alternatively, you can download the notebooks and edit the code accordingly to point to the directories on your local machine you want to use.

In [None]:
# Race/ethnicity personas on gpt-3.5-turbo-0301, including the no-persona
# baseline column. The dict holds a single topic -> personas entry.
predict_q1d_ratings(
    persona_list=next(iter(race_ethnicity.values())),
    topic=next(iter(race_ethnicity)),
    model='gpt-3.5-turbo-0301',
    base='Yes')

In [None]:
# Race/ethnicity personas on gpt-3.5-turbo-0613, including the no-persona
# baseline column. The dict holds a single topic -> personas entry.
predict_q1d_ratings(
    persona_list=next(iter(race_ethnicity.values())),
    topic=next(iter(race_ethnicity)),
    model='gpt-3.5-turbo-0613',
    base='Yes')

In [None]:
# Gender personas on gpt-3.5-turbo-0301 (no baseline column requested).
# The dict holds a single topic -> personas entry.
predict_q1d_ratings(
    persona_list=next(iter(gender.values())),
    topic=next(iter(gender)),
    model='gpt-3.5-turbo-0301')

In [None]:
# Gender personas on gpt-3.5-turbo-0613 (no baseline column requested).
# The dict holds a single topic -> personas entry.
predict_q1d_ratings(
    persona_list=next(iter(gender.values())),
    topic=next(iter(gender)),
    model='gpt-3.5-turbo-0613')

In [None]:
# Age-range personas on gpt-3.5-turbo-0301 (no baseline column requested).
# The dict holds a single topic -> personas entry.
predict_q1d_ratings(
    persona_list=next(iter(age_range.values())),
    topic=next(iter(age_range)),
    model='gpt-3.5-turbo-0301')

In [None]:
# Age-range personas on gpt-3.5-turbo-0613 (no baseline column requested).
# The dict holds a single topic -> personas entry.
predict_q1d_ratings(
    persona_list=next(iter(age_range.values())),
    topic=next(iter(age_range)),
    model='gpt-3.5-turbo-0613')