# Import libraries

`openai.ChatCompletion` is no longer supported in openai>=1.0.0, so install an earlier version. Also install cohere and tiktoken to resolve the following error message, which started appearing recently:
<blockquote>ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.<br>
llmx 0.0.15a0 requires cohere, which is not installed.<br>
llmx 0.0.15a0 requires tiktoken, which is not installed.</blockquote>

In [None]:
# openai is pinned below 1.0.0 because this notebook calls
# openai.ChatCompletion, which is not supported from 1.0.0 onwards. cohere and
# tiktoken are installed because llmx requires them.
!pip install openai==0.27.8 cohere tiktoken
# python-dotenv provides the dotenv module used to load the API key.
!pip install python-dotenv

In [None]:
import inspect
import os
import dotenv
import openai
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

# Mount drive

In [None]:
# Mount Google Drive at /content/drive; the dataset, the .env file and the
# output directories used below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Create directories to store inferences

In [None]:
# One output directory per GPT model queried below; exist_ok=True keeps this
# cell safe to re-run.
for model_name in ('gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613'):
    os.makedirs(f'/content/drive/MyDrive/datasets/COVID-HATE_{model_name}',
                exist_ok=True)

# Authentication

First save your OpenAI API key in a `.env` file at the top level of your Google Drive (i.e., at `/content/drive/MyDrive/.env` once the drive is mounted), in the format:
`OPENAI_API_KEY = '...'`.

In [None]:
# OpenAI authentication on Google Colab. via
# https://stackoverflow.com/a/77166086
# Load the variables defined in the .env file on Google Drive into the process
# environment, then hand the key to the openai client.
dotenv.load_dotenv('/content/drive/MyDrive/.env')
openai.api_key = os.getenv('OPENAI_API_KEY')
# Fail fast: without a key, every API call would fail, and only after
# classifier_with_backoff has worked through all of its retries.
if not openai.api_key:
    raise RuntimeError(
        'OPENAI_API_KEY is not set. Save it in /content/drive/MyDrive/.env '
        "in the format OPENAI_API_KEY = '...' and re-run this cell.")

# Import dataset

In [None]:
# Load the annotated tweets and replace the index with a fresh 0..n-1 range.
annotated_tweets_w_text = (
    pd.read_csv('/content/drive/MyDrive/datasets/annotated_tweets_w_text.csv')
    .reset_index(drop=True)
)

# Set prompts

In [None]:
def system_content(persona):
    """Return the system message instructing the model to act as ``persona``.

    Args:
        persona: Description of the persona to simulate, e.g.
            'an Asian in the United States'.

    Returns:
        The two-sentence system prompt with the indentation of the source
        literal removed.
    """
    raw_prompt = f"""From now on, act as {persona}.
        Provide an answer that {persona} would regarding the tweet."""
    # cleandoc strips the indentation that the second line carries only
    # because of how the literal is laid out in the source.
    return inspect.cleandoc(raw_prompt)

In [None]:
def user_content(tweet):
    """Build the user message asking the model to annotate ``tweet``.

    The fixed annotation instructions are dedented on their own and the tweet
    is appended afterwards. Dedenting a string that already contains the
    tweet would let the tweet's text take part in the indentation
    calculation: a tweet containing a line break would leave every
    instruction line indented by four spaces, and tabs inside the tweet would
    be expanded to spaces. Built this way, the prompt is the same for every
    tweet apart from the tweet text itself, which is inserted verbatim.

    Args:
        tweet: Text of the tweet to classify.

    Returns:
        The prompt string: the instructions, a blank line, and the tweet
        wrapped in triple double quotes.
    """
    instructions = inspect.cleandoc("""
    I want you to perform a data annotation task. In your output, I want you
    to return either “Hatespeech”, “Counterhate” or “Neutral”, depending on
    whether you think the following tweet is an anti-Asian COVID-19 hate
    tweet, a COVID-19 counterspeech tweet, or a neutral or irrelevant tweet.
    A tweet is an anti-Asian COVID-19 hate tweet if you perceive that it is
    antagonistic speech that is directed towards an Asian entity (individual
    person, organization, or country), and others the Asian outgroup through
    intentional opposition or hostility in the context of COVID-19. A tweet
    is a COVID-19 counterspeech tweet if you perceive that it either: (a)
    explicitly identifies, calls out, criticizes, condemns, challenges,
    or opposes racism, hate, or violence towards an Asian entity or (b)
    explicitly supports, expresses solidarity towards, or defends an Asian
    entity. A tweet is neutral or irrelevant if it neither explicitly nor
    implicitly conveys hate, nor counterspeech, but is related to COVID-19.
    Tweets in this neutral category also include news, advertisements,
    or outright spam. I want you to only respond with “Hatespeech”,
    “Counterhate” or “Neutral”. Do not provide any other outputs or any
    explanation for your output.
    """)
    # The tweet is added only after dedenting so its content cannot alter the
    # instructions' layout and is itself left untouched.
    return f'{instructions}\n\nTweet: """\n{tweet}\n"""'

# Define a function to call Chat Completion API with (persona) or without (no-persona default) the *system message*

In [None]:
# Prevent rate limit errors. via
# https://github.com/openai/openai-cookbook/blob/main/examples/
# How_to_handle_rate_limits.ipynb
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def classifier_with_backoff(user_cont, model, system_cont=None):
    """Send one chat completion request and return the reply text.

    A failing call is retried according to the wait/stop policy configured in
    the decorator (at most six attempts).

    Args:
        user_cont: Content of the user message.
        model: Model name passed to the Chat Completion API.
        system_cont: Content of the system message. When None, no system
            message is sent.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [{"role": "user", "content": user_cont}]
    if system_cont is not None:
        # When present, the system message goes ahead of the user message.
        conversation.insert(0, {"role": "system", "content": system_cont})
    completion = openai.ChatCompletion.create(
        model=model,
        messages=conversation,
        temperature=0,
    )
    return completion['choices'][0]['message']['content']

# Define a function to perform model inference on **COVID-HATE**

Create a function `predict_labels` that takes a list of personas (e.g., ['a White person in the United States', 'a Black or African American in the United States']), a `topic` (e.g., a sociodemographic attribute such as *Race/Nationality*), and a GPT model name (e.g., gpt-3.5-turbo-0301) as required input arguments. The optional argument `base` determines whether annotations made by the no-persona default are included; pass `'Yes'` to include them. For each persona in the list of personas, the function iterates over each tweet in **COVID-HATE**, calling the function `classifier_with_backoff` for each tweet. It then adds the list of labels predicted by each simulated persona as a new column to the **COVID-HATE** dataframe. The predicted labels are remapped to conform to how the data was originally coded in **COVID-HATE**: `Neutral` becomes `0`, `Counterhate` becomes `1`, and `Hatespeech` becomes `2`. The dataframe is saved as a TSV file, and the function returns the dataframe.

In [None]:
def predict_labels(persona_list, topic, model, base=None):
    """Annotate every tweet in the dataset once per persona and save the result.

    Args:
        persona_list: Personas to simulate; each one is turned into a system
            message with ``system_content``.
        topic: Label for this group of personas; only used in the output
            file name.
        model: GPT model name passed to the API; also used in the output
            directory and file names.
        base: Pass the string 'Yes' to additionally collect annotations made
            without a system message (column ``predicted_labels_base``). Any
            other value skips them.

    Returns:
        A copy of ``annotated_tweets_w_text`` with one added column per
        persona (plus the optional base column). Replies that are exactly
        'Neutral', 'Counterhate' or 'Hatespeech' are recoded to 0, 1 and 2;
        any other reply is left unchanged. The same dataframe is written as
        a TSV file under /content/drive/MyDrive/datasets/COVID-HATE_{model}/.
    """
    # Create the output directory before making any API call, so that a model
    # without a pre-made directory cannot lose its results at the final write.
    output_dir = f'/content/drive/MyDrive/datasets/COVID-HATE_{model}'
    os.makedirs(output_dir, exist_ok=True)

    df = annotated_tweets_w_text.copy(deep=True)
    tweets = df['Text'].values
    # Track the columns added here so that exactly these are recoded below,
    # however many columns the input dataset has.
    prediction_columns = []
    if base == 'Yes':
        df['predicted_labels_base'] = [
            classifier_with_backoff(user_content(twt),
                                    model) for twt in tweets]
        prediction_columns.append('predicted_labels_base')
    for persona in persona_list:
        column = f'predicted_labels_{persona}'.replace(' ', '_')
        df[column] = [
            classifier_with_backoff(user_content(twt),
                                    model,
                                    system_content(persona)) for twt in tweets]
        prediction_columns.append(column)
    # Remap predicted class labels. via
    # https://stackoverflow.com/a/20250996
    remap = {'Neutral': 0,
             'Counterhate': 1,
             'Hatespeech': 2}
    for column in prediction_columns:
        df.replace({column: remap}, inplace=True)
    df.to_csv(
        f'{output_dir}/'
        f'COVID-HATE_predictions_{topic}_{model}.tsv',
        sep='\t', index=False)
    return df

# Create dictionaries, each with a `topic` as the key and a list of personas as the value

For simplicity, given the small number of personas considered, we do not segregate them at this point and use a single dictionary.

In [None]:
# Maps a topic name to the list of personas simulated for it; the single topic
# 'all' holds every persona.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of
# the notebook. Renaming it would also require updating the predict_labels
# calls that read from it.
all = {
    'all': ['a White person in the United States',
            'a Black or African American in the United States',
            'an American Indian or Alaska Native in the United States',
            'an Asian in the United States',
            'a Chinese person in the United States',
            'a Native Hawaiian or Other Pacific Islander in the United States',
            'a Chinese person in the People\'s Republic of China',
            'a Chinese undergraduate student in the United States',
            'an Indian undergraduate student in the United States']}

# Call the `predict_labels` function

Running each of the following cells could take several hours. Colab Pro+ includes continuous code execution (capped at 24 hours) and background execution capabilities, enabling you to close your browser/device while your code runs. A CPU runtime is sufficient; no GPU is needed. Alternatively, you can download the notebooks and edit the paths in the code to point to the directories you want to use on your local machine.

In [None]:
# Annotate with gpt-3.5-turbo-0301: the first (and only) topic in `all`, its
# personas, and the no-persona baseline.
predict_labels(
    persona_list=next(iter(all.values())),
    topic=next(iter(all)),
    model='gpt-3.5-turbo-0301',
    base='Yes',
)

In [None]:
# Annotate with gpt-3.5-turbo-0613: the first (and only) topic in `all`, its
# personas, and the no-persona baseline.
predict_labels(
    persona_list=next(iter(all.values())),
    topic=next(iter(all)),
    model='gpt-3.5-turbo-0613',
    base='Yes',
)