In [None]:
import pandas as pd
from pathlib import Path
import re
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Apply matplotlib's 'ggplot' style sheet to the figures created after this call.
plt.style.use('ggplot')

In [None]:
# URL template for a leaderboard CSV hosted on scandeval.com; the `{language}`
# placeholder is filled in with `str.format` where the CSV is read.
CSV_PATHS = 'https://scandeval.com/{language}-nlg.csv'
# Alternative source: a CSV inside a local checkout under the home directory.
# Uncomment the next line to read from disk instead of the URL above.
# CSV_PATHS = str(Path.home() / "gitsky" / "scandeval.com" / "{language}-nlg-test.csv")
CSV_PATHS  # echo the active template as the cell output

In [None]:
# Language codes to analyse; each code selects the `<code>_rank` column of the
# leaderboard and becomes one column of the results table.
LANGUAGES = ['da', 'no', 'sv', 'de', 'nl', 'en']

In [None]:
# Leaderboard model IDs mapped to the short display names used in tables and plots.
models = {
    "gpt-4-0613": "gpt-4",
    "gpt-4-1106-preview": "gpt-4-turbo",
    "gpt-4o-2024-05-13": "gpt-4o",
    "meta-llama/Llama-2-70b-hf": "llama-2-70b",
    "meta-llama/Meta-Llama-3-70B": "llama-3-70b",
}

all_models = list(models.keys())
all_new_models = list(models.values())

# GPT-4 subset of `models`. It is defined in this cell because the back-fill
# loop below needs it; relying on a definition from a later cell makes a fresh
# "Restart & Run All" fail with a NameError.
gpt4_models = {model_id: model_name for model_id, model_name in models.items() if "gpt-4" in model_name}
# First GPT-4 entry; its score stands in for any GPT-4 model missing from the CSV.
gpt4_reference_id = next(iter(gpt4_models), None)

# Every per-language rank column is read from the same "germanic" CSV, so fetch
# and clean it once instead of once per language.
# NOTE(review): the `{language}` placeholder is always filled with "germanic";
# presumably intentional since the `<lang>_rank` columns live in that file - confirm.
leaderboard = (
    pd
    .read_csv(CSV_PATHS.format(language="germanic"))
    .query('merge == False')  # keep only rows whose `merge` column is False
    .set_index('model_id')
)
# Drop any parenthesised annotation from the model IDs so they can match the
# keys of `models`. A raw string avoids the invalid-escape warning for `\(`.
leaderboard.index = leaderboard.index.map(lambda x: re.sub(r'\(.*\)', '', x).strip())

results_dict = dict()
for language in LANGUAGES:
    # Copy so the back-fill below can never write through to `leaderboard`.
    scores = leaderboard[f'{language}_rank'].copy()

    # Back-fill GPT-4 models absent from the index with the reference model's score.
    for model in gpt4_models:
        if model not in scores.index:
            scores.loc[model] = scores.loc[gpt4_reference_id]

    # Select the models in the order of `models`; a model that is still missing
    # (e.g. a Llama model) raises a KeyError here rather than being silently skipped.
    scores = scores.loc[all_models]
    results_dict[language] = scores.tolist()

# One row per display name, one column per language code.
df = pd.DataFrame(results_dict, index=all_new_models)
df

In [None]:
# Keep only the `models` entries whose display name contains "gpt-4"
# (insertion order of `models` is preserved).
gpt4_models = dict(filter(lambda item: "gpt-4" in item[1], models.items()))
# Row of `df` for the first GPT-4 display name; used as the comparison baseline.
gpt4_values = df.loc[[*gpt4_models.values()][0]]
gpt4_values

In [None]:
# Per-column difference `gpt4_values - row` for every GPT-4 model's row of `df`.
gpt4_df = df.loc[[*gpt4_models.values()]].rsub(gpt4_values, axis="columns")
# Append the row-wise mean of those differences as a "germanic" column.
gpt4_df["germanic"] = gpt4_df.mean(axis="columns")
gpt4_df

In [None]:
fig, ax = plt.subplots(figsize=(5, 4))

# (column in gpt4_df, legend label, line colour), listed in drawing order so
# the legend entries keep their order.
language_lines = [
    ("en", "English", 'red'),
    ("da", "Danish", 'blue'),
    ("sv", "Swedish", 'purple'),
    ("no", "Norwegian", 'black'),
    ("de", "German", 'green'),
    ("nl", "Dutch", 'yellow'),
]
for column, label, color in language_lines:
    ax.plot(gpt4_df[column], label=label, color=color)

# Lightly shade the band from -0.4 up to 0.0 across all models.
ax.fill_between(x=gpt4_df.index.tolist(), y1=-0.4, y2=0.0, facecolor='red', alpha=0.05, label="Worse than GPT-4")

ax.set_title("LLM Performance of GPT-4 models")
ax.set_ylim(-0.4, 0.1)
ax.set_ylabel('Negative ScandEval rank relative to GPT-4', fontsize=10)
ax.legend(bbox_to_anchor=(1, 1))  # anchor the legend at the axes' upper-right corner

# Write the figure to disk before showing it.
fig.savefig('gpt4-drop.png', bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Add a "mean" column: each model's average over the columns currently in `df`.
df["mean"] = df.mean(axis="columns")
df

In [None]:
fig, ax = plt.subplots(figsize=(5, 4))
ax.set_title('ScandEval Germanic Rank (lower is better)', fontsize=13)

# One bar series per model family; the two separate bar() calls give each
# family its own colour from the style's colour cycle.
gpt4_model_names = list(filter(lambda name: "gpt-4" in name, models.values()))
ax.bar(x=gpt4_model_names, height=df.loc[gpt4_model_names, "mean"])

llama_model_names = list(filter(lambda name: "llama" in name, models.values()))
ax.bar(x=llama_model_names, height=df.loc[llama_model_names, "mean"])

# Dashed horizontal reference line at the "gpt-4o" mean.
ax.axhline(y=df.loc["gpt-4o", "mean"], linestyle="--")

# `ax` is the current axes, so pyplot's xticks() rotates its tick labels.
plt.xticks(rotation=20)
ax.set_ylim(0.9, 2.0)
fig.savefig('gpt4-llama.png', bbox_inches="tight", dpi=300)
plt.show()