In [22]:
import pandas as pd
import numpy as np
import httpx
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
import os
import re
from tqdm import tqdm

In [23]:
load_dotenv(dotenv_path='../.env')

API_KEY = os.getenv("OPENAI_API_KEY")

In [24]:
def extract_rating(s: str) -> int:
    try:
        return int(re.findall(r'\d+', s)[0])
    except IndexError:
        return -1

In [25]:
import re


def query_gpt35(chiefcomplaint):
    header = {
        'Authorization': f'Bearer {API_KEY}',
        'Content-Type': 'application/json',
    }

    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "system",
                "content": "You are an emergency department doctor.",
            },
            {
                "role": "user",
                "content": f"<s>[INST] You are an emergency department doctor. Please rate the following chief complaint on a scale of 1 to 10, where 10 is most severe. Provide only the rating in your answer. Start your answer with the rating in a numerical format. Like this: [RATING] [Explanation]. For example: 8. The chief complaint is very severe... Here is the chief complaint: {chiefcomplaint} [/INST]",
            },
        ],
        "max_tokens": 3,
    }

    url = 'https://api.openai.com/v1/chat/completions'

    try:
        response = httpx.post(url, headers=header, json=payload)
        response.raise_for_status()
        rating = response.json()['choices'][0]['message']['content']
    except (httpx.HTTPError, KeyError, IndexError) as e:
        print(f"An error occurred: {e}")
        rating = "None"

    return extract_rating(rating)


In [26]:
gptrated_frame = pd.read_csv('../processed_data/gptrated_frame.csv')

In [31]:
from time import sleep


for i, row in tqdm(gptrated_frame.iterrows(), total=gptrated_frame.shape[0]):
    if row['rating'] == -1:
        rating = query_gpt35(row['chiefcomplaint'])
        gptrated_frame.at[i, 'rating'] = rating

100%|██████████| 60407/60407 [10:17<00:00, 97.80it/s]  


In [29]:
gptrated_frame.to_csv('../processed_data/gptrated_frame.csv', index=False)

In [32]:
gptrated_frame["rating"].value_counts()

rating
7     35207
9      9448
8      6860
6      5793
5      1489
10     1273
4       137
3       112
2        63
0        17
1         8
Name: count, dtype: int64