# GPT Annotation (batch per request)

### Import required libraries

In [None]:
from openai import OpenAI
import pandas as pd
from dotenv import load_dotenv
import os
from time import sleep

# Load variables from a local .env file into the process environment;
# the OPENAI_API_KEY read further down via os.getenv is expected to come from there.
load_dotenv()

True

## GPT Annotation

### Set API key

In [None]:
# Shared OpenAI client used by the annotation function below.
# os.getenv returns None when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

#### Read input and define output location

In [12]:
# Semicolon-separated CSV holding the raw articles to annotate.
input_path = ".././../../../data/raw/articles/guardian/2022_articles_guardian_raw.csv" # can be modified
# File the annotated frame is written to (relative to the working directory).
output_path = "2022_articles_guardian_raw_annotated.csv" # can be modified

# The annotation loop reads the 'Title' and 'Tags' columns of this frame.
df = pd.read_csv(input_path, sep=';')

(179, 10)


### Function to run the annotation

In [3]:
def get_res(batch):
    """Classify a batch of articles with the chat model.

    Args:
        batch: Text containing the items to classify, each wrapped in
            ``<itemStart> ... <itemEnd>``.

    Returns:
        A list of strings, one per comma-separated entry of the model's
        answer, with surrounding whitespace and quotes removed. The list is
        empty when the model returns no content.
    """
    # Define the system and user messages
    system_message = "You are an excellent annotator. You will be provided with a batch of items. Each item consists of a title and one or multiple tags that both belong to one news article. The items start with <itemStart> and end with <itemEnd>. Your task is to classify the topic of a news article by looking at its title and tags in either Category-1: politics, Category-2: business and economy, Category-3: environment, Category-4: sports, Category-5: entertainment and culture, Category-6: science and technology or Category-7: health. Return for each article only the number of the category in a python list where the result matches the order of the input."
    user_message = f"{batch}"

    # Send the messages to the chat model and get the response
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        temperature=0,
        max_tokens=1000
    )

    # The message content can be None; treat that as an empty answer.
    content = (response.choices[0].message.content or "").strip()

    # Keep only what is inside the outermost brackets when they are present,
    # instead of blindly dropping the first and last character: the reply may
    # carry surrounding whitespace/text or omit the brackets altogether.
    opening = content.find("[")
    closing = content.rfind("]")
    if opening != -1 and closing > opening:
        content = content[opening + 1:closing]

    # Split into entries, dropping blanks and any quoting around the numbers.
    categories = []
    for entry in content.split(","):
        cleaned = entry.strip().strip("'\"").strip()
        if cleaned:
            categories.append(cleaned)

    return categories

### Create batches and call function to run the annotation

In [None]:
# Number of articles sent to the model per request.
batchSize = 10

# Category number -> label, mirroring the categories listed in the system
# prompt of get_res. Not used in this cell; the raw number is stored.
cat = {
     1: "politics",
     2: "business and economy",
     3: "environment",
     4: "sports",
     5: "entertainment and culture",
     6: "science and technology",
     7: "health"
    }


def _annotate_batch(row_labels, items):
    """Send one batch to the model and store the categories in ``df``.

    Args:
        row_labels: Index labels of the rows in the batch, in batch order.
        items: The formatted ``<itemStart> ... <itemEnd>`` strings, one per row.

    Returns:
        True when the model returned exactly one category per row and the
        results were written; False otherwise (nothing is written then,
        because the answers cannot be aligned with the rows).
    """
    categories = get_res("".join(items))
    print(categories)
    if len(categories) != len(row_labels):
        print(
            f"Expected {len(row_labels)} categories for rows "
            f"{row_labels[0]}..{row_labels[-1]}, got {len(categories)}; stopping."
        )
        return False
    for label, category in zip(row_labels, categories):
        value = category.strip().replace(",", "")
        print(label, value)
        # Address the row by its own index label rather than by arithmetic on
        # the loop index, so the write is correct for any index.
        df.loc[label, ['Category']] = value
    return True


batch_labels = []
batch_items = []
all_batches_ok = True
for index, row in df.iterrows():
    # A full batch is flushed before the current row starts the next one.
    if len(batch_items) == batchSize:
        all_batches_ok = _annotate_batch(batch_labels, batch_items)
        if not all_batches_ok:
            break
        sleep(3)  # pause between requests
        batch_labels = []
        batch_items = []

    title = row['Title']
    tags = row['Tags']
    # Tags are stored as a bracketed string; drop the brackets. A missing
    # value is not a string and is sent as an empty tag list.
    tags = tags[1:-1] if isinstance(tags, str) else ""
    # Every item uses the same format, including the first one of a batch.
    batch_items.append(
        f"<itemStart> ARTICLE-{len(batch_items) + 1}: Title: {title}; Tags: {tags} <itemEnd>"
    )
    batch_labels.append(index)

# Annotate the remaining rows, unless an earlier batch already failed
# (re-sending that batch here would write its results to the wrong rows).
if all_batches_ok and batch_items:
    _annotate_batch(batch_labels, batch_items)


df.to_csv(output_path, sep=';')
