In [9]:
import re
import string

from scripts.executor import ExecutionResult


def eval_model_results(results: list[ExecutionResult], message_extactor=None):
    tmp = []
    for result in results:
        riddle_answer = result.riddle.answer
        riddle_answer_letter = string.ascii_uppercase[result.riddle.label]
        model_answer = result.model_output.get_ai_response().content
        if message_extactor:
            model_answer = message_extactor(model_answer)
        correct = riddle_answer_letter == model_answer or model_answer == riddle_answer
        if not correct:
            regex_res = re.search(r"\((A|B|C|D)\)", model_answer)
            if model_answer.startswith("("):
                model_answer = model_answer[1:2]
            elif model_answer.startswith(
                (
                    f"{riddle_answer_letter} ",
                    f"{riddle_answer_letter})",
                    f"{riddle_answer_letter}\n",
                    f"{riddle_answer_letter}.",
                )
            ):
                model_answer = model_answer[0:1]
            elif model_answer.startswith(
                f"**{riddle_answer_letter}",
            ):
                model_answer = model_answer[2:3]
            elif regex_res:
                model_answer = regex_res.group(1)

        # Update the model answer in the result, as it might have been changed
        correct = riddle_answer_letter == model_answer or model_answer.startswith(
            riddle_answer
        )
        tmp.append(correct)
    correct_answers = sum(tmp)
    total_answers = len(tmp)
    correct_answers_percentage = correct_answers / total_answers * 100
    return correct_answers_percentage

In [7]:
import dill as pickle

results_wrapper = None
with open("results/sp_results_naive.pkl", "rb") as f:
    results_wrapper = pickle.load(f)

for model, results in results_wrapper.items():
    print(f"{model}: {eval_model_results(results)}%")

print("\n")
print("\n")
with open("results/wp_results_naive.pkl", "rb") as f:
    results_wrapper = pickle.load(f)

for model, results in results_wrapper.items():
    print(f"{model}: {eval_model_results(results)}%")

llama3.1:8b: 45.13556618819777%
llama3.2:1b: 9.090909090909092%
llama3.2:3b: 25.996810207336523%
phi3.5:3.8b: 1.1164274322169059%
phi4:14b: 75.11961722488039%
qwen2.5:0.5b: 17.22488038277512%
qwen2.5:1.5b: 15.94896331738437%
qwen2.5:3b: 20.095693779904305%
qwen2.5:7b: 51.03668261562998%
qwen2.5:14b: 56.61881977671451%
qwen2.5:32b: 68.74003189792663%
gemma2:2b: 11.164274322169058%
gemma2:9b: 76.39553429027113%
gemma2:27b: 82.93460925039872%
mistral-nemo:12b: 49.122807017543856%




llama3.1:8b: 35.77235772357724%
llama3.2:1b: 2.0325203252032518%
llama3.2:3b: 15.650406504065039%
phi3.5:3.8b: 1.8292682926829267%
phi4:14b: 64.02439024390245%
qwen2.5:0.5b: 12.398373983739837%
qwen2.5:1.5b: 3.2520325203252036%
qwen2.5:3b: 10.365853658536585%
qwen2.5:7b: 7.926829268292683%
qwen2.5:14b: 29.065040650406505%
qwen2.5:32b: 54.87804878048781%
gemma2:2b: 3.4552845528455287%
gemma2:9b: 63.41463414634146%
gemma2:27b: 68.69918699186992%
mistral-nemo:12b: 24.390243902439025%


In [11]:
def extract_deepseek_answer(model_answer: str):
    think_end_index = model_answer.find("</think>")
    if think_end_index == -1:
        return model_answer
    return model_answer[think_end_index + 8 :]  # 8 is the length of "</think>"


deepspeed_results = None
with open(
    "results/checkpoints/sp_results_naive_async/deepseek-r1:8b_sp_results_naive_async.pkl",
    "rb",
) as f:
    deepspeed_results = pickle.load(f)

    print(eval_model_results(deepspeed_results, extract_deepseek_answer))

40.5103668261563


In [None]:
import numpy as np

results = np.load(
    "results/checkpoints/sp_results_few_shot_n_{n_shots}/llama3.1:8b_sp_results_few_shot_n_{n_shots}.pkl",
    allow_pickle=True,
)
display(results)