In [23]:
import json
import pandas as pd
from tqdm import tqdm

from openai import OpenAI
from sklearn.metrics import f1_score
from datasets import load_dataset, concatenate_datasets
from tenacity import retry, stop_after_attempt, wait_fixed # for exponential backoff

# init openAI client. You need to set your openAI API key in your environment variables to make this work
client = OpenAI()

# GPT version which should be used
MODEL_ID = "gpt-3.5"
# system prompt for the GPT model
SYSTEM_PROMPT = "you are an expert on clinical trials in the medical domain. You will get up to two pieces of evidence and a statement. Your job is to decide if the statement makes logical sense, given the pieces of evidence."
# location of the preprocessed dataset
DATA_LOCATION = "../data/"

In [19]:
# load data
data = load_dataset('json', data_files={'train': [f'{DATA_LOCATION}train.json'],
                                           'val': [f'{DATA_LOCATION}val.json'],
                                         'test': [f'{DATA_LOCATION}test.json']})

In [20]:
# concatenate train + val data, so we can explore both together
# test with only 100, so it does not get too expensive
val_data = data['val'].to_pandas()[:100]

In [21]:
# retry decorator to avoid hitting usage limits
@retry(wait=wait_fixed(10), stop=stop_after_attempt(6))
def sendPrompt(prompt: str, model_id):
    """Sends a prompt to a GPT model via the OpenAI API

    Args:
        prompt: prompt to send to the model
        model_id: OpenAI model ID (e.g. gpt-3.5)

    Returns:
        response of the model
    """

    completion = client.chat.completions.create(
    model=model_id,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    )

    #print(completion.choices[0].message.content)
    return completion.choices[0].message.content


def label_to_id(label: str):
    """Converts the output label of the model to ints, so we can analyze the performance.
    Entailment is converted to 0, contradiction is converted to 1

    Args:
        label (str): the label to be converted

    Returns:
        the int label
    """    
    pred = label.strip().lower()
    # sometimes the models reply yes/no -> translate this to entailment/contradiction
    if pred == "entailment" or pred == "yes":
        return 1
    if pred == "contradiction" or pred == "no":
        return 0
    return 0

In [None]:
# generate predictions
# Create an empty list to store predictions
predictions = []

# Iterate over the rows in val_data with tqdm for progress bar
for index, row in tqdm(val_data.iterrows(), total=len(val_data)):
    # Call sendPrompt function and append the prediction to the list
    predictions.append(sendPrompt(row['text'], MODEL_ID))
val_data['prediction'] = predictions

In [11]:
def evaluate(data):
    row['prediction'] = row['prediction'].apply(label_to_id)
    row['label'] = row['label'].apply(label_to_id)
    
    return f1_score(row['label'],row['prediction'])

In [12]:
evaluate(val_data)

0.8823529411764707

## Results
gpt3.5-turbo: 0.6629834254143647 (costs 0.16 $ for 200 examples)

gpt4: 0.8823529411764707 (costs 2.10 $ for 100 examples)