## Test set creation

- From `covid_qa_deepset`
- 80/20 train/test split (random seed = 2023); test set n = 404

In [None]:
from askem.data import get_covid_qa

# Load the "test" split through the project helper (see the notes at the top
# of this section for how the split was produced).
test_data = get_covid_qa(split="test")
# Bare expression so the notebook renders the loaded object as cell output.
test_data


In [None]:
import pandas as pd

# Petal scrape results; `get_petal_ans` below reads its `id` and `answer`
# columns.
petal_df = pd.read_csv("data/petal_bench.csv")


def get_petal_ans(example):
    """Attach the scraped Petal answer and the reference answer to ``example``.

    Looks up the row of the module-level ``petal_df`` whose ``id`` equals
    ``example["id"]`` and stores its ``answer`` under ``"petal_answer"``
    (``None`` when the scrape has no row for that id). The first reference
    answer text is stored under ``"true_answer"``.

    Args:
        example: A mapping with an ``"id"`` key and an ``"answers"`` mapping
            whose ``"text"`` entry is a non-empty sequence.

    Returns:
        The same ``example`` object, mutated in place with the two new keys.

    Raises:
        IndexError: If ``example["answers"]["text"]`` is empty.
    """
    # Compare values directly instead of interpolating the id into a
    # ``DataFrame.query`` expression: a query string only works for ids that
    # happen to be valid bare literals and fails for string ids.
    matches = petal_df.loc[petal_df["id"] == example["id"], "answer"]

    # If the id occurs more than once, the first scraped answer wins.
    example["petal_answer"] = None if matches.empty else matches.iloc[0]
    example["true_answer"] = example["answers"]["text"][0]
    return example


In [None]:
# Annotate every example with its Petal answer, then keep only the examples
# for which the scrape actually produced one.
test_data = test_data.map(get_petal_ans).filter(
    lambda row: row["petal_answer"] is not None
)


In [None]:
# Persist the filtered examples (now carrying `petal_answer` and
# `true_answer`) so the scoring cells can start from this file.
test_data.to_parquet("data/petal_bench.parquet")

In [None]:
# Reload the saved benchmark as a pandas DataFrame for scoring.
petal_bench = pd.read_parquet("data/petal_bench.parquet")

In [None]:
from bert_score import score

y_true = petal_bench["true_answer"].tolist()
y_pred = petal_bench["petal_answer"].tolist()
# `score` takes the candidates (system outputs) first and the references
# second. Passing the gold answers first swaps the returned precision and
# recall, so the arguments are named explicitly here.
precision, recall, f1 = score(cands=y_pred, refs=y_true, lang="en", verbose=True)


Save the BERTScore results to Parquet for easier access

In [None]:
# Attach each BERTScore output as a NumPy-backed column, then overwrite the
# benchmark file so the scores are stored alongside the answers.
for column, values in (("precision", precision), ("recall", recall), ("f1", f1)):
    petal_bench[column] = values.numpy()
petal_bench.to_parquet("data/petal_bench.parquet")


## Visualize

In [None]:
import pandas as pd
import altair as alt

# Histogram of per-example BERTScore F1: binned F1 on the x-axis, number of
# examples per bin on the y-axis.
df = pd.read_parquet("data/petal_bench.parquet")
f1_axis = alt.X("f1", bin=alt.Bin())
f1_histogram = alt.Chart(df[["f1"]]).mark_bar().encode(x=f1_axis, y="count()")
# Final expression of the cell, so the notebook renders the chart.
f1_histogram


In [None]:
# Report the mean of the `f1` column, formatted to four decimal places.
print(f"BERT-f1 with COVID-QA: {df.f1.mean():.4f}")