In [None]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
from pathlib import Path

import pandas as pd
from plotnine import *

In [None]:
# Load the annotated results (JSON Lines, one record per line) and remove the
# "bedrock/" text from the model names.
results_annotated = pd.read_json(
    "../data/results_annotated.jsonl",
    orient="records",
    lines=True,
).pipe(
    lambda frame: frame.assign(model=frame["model"].str.replace("bedrock/", ""))
)

In [None]:
# Number of scored rows for every (model, format) pair; pairs with no rows
# are shown as 0.
tmp = (
    results_annotated
    .assign(score=lambda frame: frame["label"].eq("Correct"))
    .pipe(
        pd.pivot_table,
        values="score",
        index="model",
        columns="format",
        aggfunc="count",
    )
    .fillna(0)
    .astype(int)
)

tmp

In [None]:
# Share of rows labelled "Correct" for every (model, format) pair; pairs with
# no rows are shown as 0.0.
tmp = (
    results_annotated
    .assign(score=lambda frame: frame["label"].eq("Correct"))
    .pipe(
        pd.pivot_table,
        values="score",
        index="model",
        columns="format",
        aggfunc="mean",
    )
    .fillna(0)
    .astype(float)
)

tmp

In [None]:
# Category order for the `model` column (used when building the summary frame).
# NOTE(review): the ordering looks like a hand-picked ranking -- confirm intent.
model_sort = [
    "gpt-4o-2024-05-13",
    "anthropic.claude-3-opus-20240229-v1:0",
    "gpt-4-turbo-2024-04-09",
    "gemini-1.5-pro-preview-0409",
    "anthropic.claude-3-sonnet-20240229-v1:0",
    "anthropic.claude-3-haiku-20240307-v1:0",
    "gpt-3.5-turbo",
]

# Category order for the `method` column.
method_sort = [
    "direct_ask",
    "structured_output",
]

In [None]:
# Mean correctness per (model, method), sorted by score (highest first).
# "with_instructor" in `format` is relabelled as "structured_output".
# `model` and `method` are then cast to categoricals using `model_sort` and
# `method_sort`; values missing from those lists become NaN.
summary_df = (
    results_annotated
    .assign(
        method=lambda frame: frame["format"].str.replace("with_instructor", "structured_output"),
        score=lambda frame: frame["label"].eq("Correct"),
    )
    .groupby(["model", "method"], as_index=False)["score"]
    .mean()
    .sort_values(by="score", ascending=False)
    .assign(
        model=lambda frame: pd.Categorical(frame["model"], categories=model_sort),
        method=lambda frame: pd.Categorical(frame["method"], categories=method_sort),
    )
)



In [None]:
# Grouped bar chart of the mean score per method, with one dodged bar per model.
# NOTE(review): the sample size in the subtitle is hard-coded -- confirm it
# still matches the per-(model, format) counts computed earlier.
p = (
    ggplot(summary_df)
    + theme_538()
    + aes(x="factor(method)", y="score", fill="model")
    + geom_bar(stat="identity", position="dodge", color="black")
    + scale_fill_brewer(type='qual', palette='Set3')
    # `score` is a 0-1 proportion; format the tick labels as percentages so the
    # axis agrees with its "Correct %" title.
    + scale_y_continuous(labels=lambda breaks: [f"{b:.0%}" for b in breaks])
    + theme(figure_size=(4.5 * 1.618, 4.5))
    + labs(
        x="Method",
        y="Correct %",
        fill="Model",
        title="ABBA Eval: How good are these LLMs at rhyming in ABBA scheme?",
        subtitle="A small sample evaluation: 45 per (model, method) combination",
    )
)
p

In [None]:
# Write the figure to disk. The output directory is created first so that a
# fresh checkout (where ../output may not exist) can still run end to end.
output_file = "../output/results.png"
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
p.save(output_file, dpi=300)