## Test set creation

- From `covid_qa_deepset`
- 80/20 train/test split, seed = 2023, test n = 404

In [None]:
from askem.data import COVID_QA, GPTBench
from askem.openai import get_answer
from tqdm.notebook import tqdm

# Hold out 20% of COVID-QA as the test split, seeded with 2023.
test_data = COVID_QA.train_test_split(
    seed=2023,
    test_size=0.2,
)["test"]
test_data

## Query ChatGPT
Model: `gpt-3.5-turbo-0301`

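Run once. The cell below is left commented out so that re-running the notebook does not query ChatGPT again.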

In [None]:
# NOTE: this whole cell is commented out; presumably it was disabled after the
# one-off run that filled `GPTBench`. When enabled, it calls `get_answer` with
# the context and question of every test example and inserts the result into
# `GPTBench`. Ids already stored in `GPTBench` are skipped, and conflicting
# inserts are ignored.
# done_ids = [row.id for row in GPTBench.select(GPTBench.id)]

# for data in tqdm(test_data):
#     if data["id"] in done_ids:
#         continue

#     gpt_answer = get_answer(data["context"], data["question"])
#     GPTBench.insert(
#         id=data["id"],
#         context=data["context"],
#         question=data["question"],
#         true_answer=data["answers"],
#         gpt_answer=gpt_answer,
#     ).on_conflict_ignore().execute()

## Calculate BERT-f1

In [None]:
from askem.data import GPTBench, to_df
from ast import literal_eval
from bert_score import score

# Load the stored benchmark rows into a DataFrame.
df = to_df(GPTBench)

# Preview only the columns that the scoring step uses.
df[
    [
        "id",
        "true_answer",
        "gpt_answer",
    ]
]

In [None]:
def to_text(true_answer: str) -> str:
    """Return the first answer text from a stringified answers record.

    ``true_answer`` is the string form of a mapping that has a ``"text"``
    list. It is parsed with ``literal_eval`` and the first entry of that
    list is returned.
    """
    parsed = literal_eval(true_answer)
    texts = parsed["text"]
    return texts[0]


# Gold answers (references) and ChatGPT answers (candidates), aligned row by row.
y_true = [to_text(x) for x in df["true_answer"]]
y_pred = df["gpt_answer"].tolist()

# bert_score.score expects (candidates, references) in that order. Passing the
# references first swaps the meaning of the returned precision and recall, so
# the model output must be the first argument.
precision, recall, f1 = score(y_pred, y_true, lang="en", verbose=True)


Save to parquet for easier access

In [None]:
# Attach the three per-row metrics as NumPy arrays in one unpacking assignment,
# then write the scored frame to disk.
df["precision"], df["recall"], df["f1"] = (
    metric.numpy() for metric in (precision, recall, f1)
)
df.to_parquet("data/gpt_bench.parquet")

## Visualize

In [None]:
import pandas as pd
import altair as alt

# Reload the scored benchmark from the parquet file written earlier.
df = pd.read_parquet("data/gpt_bench.parquet")

# Histogram of the per-row F1 scores: binned F1 on x, row count on y.
alt.Chart(df).mark_bar().encode(
    alt.X("f1", bin=alt.Bin()),
    alt.Y("count()"),
)


In [None]:
# Headline number: mean of the F1 column over all rows in `df`.
mean_f1 = df["f1"].mean()
print(f"ChatGPT BERT-f1 with COVID-QA: {mean_f1:.4f}")
