In [10]:
import re
import string

from scripts.executor import ExecutionResult


def _normalize_model_answer(model_answer, riddle_answer_letter):
    """Reduce a free-form model reply to a single choice letter where possible.

    The checks are tried in order and the first one that applies wins:

    1. reply starts with ``(``           -> the character right after it
    2. reply starts with the correct letter followed by space, ``)``,
       newline or ``.``                 -> that letter
    3. reply starts with a single space and the correct letter -> that letter
    4. reply starts with ``**`` and the correct letter         -> that letter
    5. reply contains ``(A)``..``(D)`` anywhere -> the first such letter

    If nothing applies the reply is returned unchanged.
    """
    if model_answer.startswith("("):
        return model_answer[1:2]

    letter_prefixes = (
        f"{riddle_answer_letter} ",
        f"{riddle_answer_letter})",
        f"{riddle_answer_letter}\n",
        f"{riddle_answer_letter}.",
    )
    if model_answer.startswith(letter_prefixes):
        return model_answer[0:1]
    if model_answer.startswith(f" {riddle_answer_letter}"):
        return model_answer[1:2]
    if model_answer.startswith(f"**{riddle_answer_letter}"):
        return model_answer[2:3]

    # Only searched when none of the prefix checks matched.
    # The pattern recognises the letters A-D only.
    parenthesised = re.search(r"\((A|B|C|D)\)", model_answer)
    if parenthesised:
        return parenthesised.group(1)
    return model_answer


def eval_model_results(results: list[ExecutionResult], message_extactor=None):
    """Return the percentage of results whose model reply matches the riddle.

    Args:
        results: Items exposing ``riddle.answer`` (answer text),
            ``riddle.label`` (index into ``string.ascii_uppercase``) and
            ``model_output.get_ai_response().content`` (the model's reply).
        message_extactor: Optional callable applied to the raw reply before
            scoring. The parameter spelling is kept as-is so existing keyword
            callers continue to work.

    Returns:
        Accuracy as a float in the range 0-100. An empty ``results`` yields
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    outcomes = []
    for result in results:
        riddle_answer = result.riddle.answer
        riddle_answer_letter = string.ascii_uppercase[result.riddle.label]
        model_answer = result.model_output.get_ai_response().content
        if message_extactor:
            model_answer = message_extactor(model_answer)

        # Replies that already equal the letter or the full answer text are
        # left untouched; everything else goes through normalisation.
        exact_match = (
            riddle_answer_letter == model_answer or model_answer == riddle_answer
        )
        if not exact_match:
            model_answer = _normalize_model_answer(model_answer, riddle_answer_letter)

        # Re-evaluate after normalisation: either the letter matches or the
        # reply begins with the riddle's answer text.
        outcomes.append(
            riddle_answer_letter == model_answer
            or model_answer.startswith(riddle_answer)
        )

    if not outcomes:
        # Nothing to score: avoid dividing by zero.
        return 0.0
    return sum(outcomes) / len(outcomes) * 100

In [11]:
from pathlib import Path

import dill as pickle


def print_results(path: Path | str):
    """Load a pickled results file and print one accuracy line per model.

    Args:
        path: Location of the pickle file, as a ``Path`` or a plain string.
            The unpickled object is expected to expose ``.items()`` yielding
            ``(model, results)`` pairs; each ``results`` value is passed to
            ``eval_model_results``.
    """
    results_file = Path(path) if isinstance(path, str) else path

    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(results_file, "rb") as handle:
        results_by_model = pickle.load(handle)

    for model_name, model_results in results_by_model.items():
        accuracy = eval_model_results(model_results)
        print(f"{model_name}: {accuracy}%")


# Each entry pairs a heading with the pickle file whose results are reported.
REPORTS = (
    ("Results for the naive prompted models", "results/sp_results_naive.pkl"),
    (
        "Results for the few shot prompted models",
        "results/sp_results_few_shot_n_4.pkl",
    ),
)

for report_index, (heading, results_path) in enumerate(REPORTS):
    if report_index:
        # Visual gap between consecutive reports.
        print("\n\n")
    print(heading)
    print_results(results_path)

Results for the naive prompted models
llama3.1:8b: 45.13556618819777%
llama3.2:1b: 9.090909090909092%
llama3.2:3b: 25.996810207336523%
phi3.5:3.8b: 1.1164274322169059%
phi4:14b: 75.11961722488039%
qwen2.5:0.5b: 17.22488038277512%
qwen2.5:1.5b: 15.94896331738437%
qwen2.5:3b: 20.095693779904305%
qwen2.5:7b: 51.03668261562998%
qwen2.5:14b: 56.61881977671451%
qwen2.5:32b: 68.74003189792663%
gemma2:2b: 11.164274322169058%
gemma2:9b: 76.39553429027113%
gemma2:27b: 82.93460925039872%
mistral-nemo:12b: 49.122807017543856%



Results for the few shot prompted models
llama3.1:8b: 68.37881219903691%
llama3.2:1b: 24.398073836276083%
llama3.2:3b: 51.36436597110754%
phi3.5:3.8b: 7.3836276083467105%
phi4:14b: 78.81219903691814%
qwen2.5:0.5b: 30.658105939004816%
qwen2.5:1.5b: 57.46388443017657%
qwen2.5:3b: 58.42696629213483%
qwen2.5:7b: 74.95987158908507%
qwen2.5:14b: 68.86035313001605%
qwen2.5:32b: 77.36757624398074%
gemma2:2b: 39.96789727126806%
gemma2:9b: 77.36757624398074%
gemma2:27b: 82.825040128