In [None]:
import heapq

import dill as pickle

from scripts.evaluation import eval_model_results


def get_best_prompt_for_each_model(input_data):
    """Pick the single highest-scoring prompt type per model and dataset type.

    Args:
        input_data: Nested mapping walked as
            ``prompt type -> dataset type -> model -> result``. Every result
            is handed to ``eval_model_results`` to obtain its score.

    Returns:
        dict: ``{model: {dataset_type: {"prompt_type": ..., "score": ...}}}``
        holding the winning prompt type and its score. When several prompt
        types tie, the one encountered first is kept.
    """
    winners = {}

    for prompt_name, per_dataset in input_data.items():
        for dataset_name, per_model in per_dataset.items():
            for model_name, raw_result in per_model.items():
                candidate_score = eval_model_results(raw_result)

                model_entry = winners.setdefault(model_name, {})
                incumbent = model_entry.get(dataset_name)

                # Only a strictly greater score displaces the current winner,
                # so the earliest prompt type survives a tie.
                if incumbent is not None and not (
                    candidate_score > incumbent["score"]
                ):
                    continue

                model_entry[dataset_name] = {
                    "prompt_type": prompt_name,
                    "score": candidate_score,
                }

    return winners


def get_best_n_prompts_for_each_model(input_data, n=3):
    """Rank prompt types per model and dataset type and keep the top ``n``.

    Args:
        input_data: Nested mapping walked as
            ``prompt type -> dataset type -> model -> result``. Every result
            is handed to ``eval_model_results`` to obtain its score.
        n: Maximum number of entries kept for each (model, dataset type) pair.

    Returns:
        dict: ``{model: {dataset_type: [{"prompt_type": ..., "score": ...}, ...]}}``
        where each list comes from ``heapq.nlargest`` and is therefore ordered
        from the highest score downwards.
    """
    scored = {}

    # Pass 1: file every (prompt type, score) pair under its model and dataset.
    for prompt_name, per_dataset in input_data.items():
        for dataset_name, per_model in per_dataset.items():
            for model_name, raw_result in per_model.items():
                entry = {
                    "prompt_type": prompt_name,
                    "score": eval_model_results(raw_result),
                }
                scored.setdefault(model_name, {}).setdefault(
                    dataset_name, []
                ).append(entry)

    def by_score(entry):
        return entry["score"]

    # Pass 2: trim each collected list down to its n best-scoring entries.
    return {
        model_name: {
            dataset_name: heapq.nlargest(n, entries, key=by_score)
            for dataset_name, entries in per_dataset.items()
        }
        for model_name, per_dataset in scored.items()
    }


# Read the pickled evaluation results from disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
results_wrapper = None
with open("results/results_test_0_system-prime-messages.pkl", "rb") as results_file:
    results_wrapper = pickle.load(results_file)

# Pick the top-scoring prompt type per model and dataset type, then show it.
best_prompt_types = get_best_prompt_for_each_model(results_wrapper)
print(best_prompt_types)

# Persist the selection to results/best_prompt_types.pkl.
with open("results/best_prompt_types.pkl", "wb") as output_file:
    pickle.dump(best_prompt_types, output_file)

{'llama3.1:8b': {'sp': {'prompt_type': 'default_improved', 'score': 62.0}, 'wp': {'prompt_type': 'pattern_matching', 'score': 38.0}}, 'llama3.2:1b': {'sp': {'prompt_type': 'confidence', 'score': 6.0}, 'wp': {'prompt_type': 'confidence', 'score': 12.0}}, 'llama3.2:3b': {'sp': {'prompt_type': 'default', 'score': 32.0}, 'wp': {'prompt_type': 'intuitive', 'score': 24.0}}, 'phi3.5:3.8b': {'sp': {'prompt_type': 'default', 'score': 34.0}, 'wp': {'prompt_type': 'assumption_challenge', 'score': 30.0}}, 'phi4:14b': {'sp': {'prompt_type': 'default', 'score': 86.0}, 'wp': {'prompt_type': 'default_improved', 'score': 66.0}}, 'qwen2.5:0.5b': {'sp': {'prompt_type': 'step_by_step', 'score': 26.0}, 'wp': {'prompt_type': 'step_by_step', 'score': 28.000000000000004}}, 'qwen2.5:1.5b': {'sp': {'prompt_type': 'step_by_step', 'score': 14.000000000000002}, 'wp': {'prompt_type': 'step_by_step', 'score': 20.0}}, 'qwen2.5:3b': {'sp': {'prompt_type': 'intuitive', 'score': 20.0}, 'wp': {'prompt_type': 'intuitive',

In [3]:
import dill as pickle

# Re-read the pickled evaluation results so this cell does not rely on an
# earlier load.
results_wrapper = None
with open("results/results_test_0_system-prime-messages.pkl", "rb") as results_file:
    results_wrapper = pickle.load(results_file)

# Rank the prompt types for every model and dataset type (the function's
# default keeps the three best).
top_n_prompts = get_best_n_prompts_for_each_model(results_wrapper)

In [4]:
# Print the ranked prompt types: one section per model, one sub-section per
# dataset type, one numbered line per prompt type.
for model, dataset_dict in top_n_prompts.items():
    print(f"--- Best Prompt Types for Model: {model} ---")
    for dataset_type, ranked_prompts in dataset_dict.items():
        print(f"\n  Dataset: {dataset_type}")
        print(f"  {'-' * 30}")
        for rank, entry in enumerate(ranked_prompts, start=1):
            prompt_name, score = entry["prompt_type"], entry["score"]
            print(f"    {rank}. {prompt_name} (Score: {score:.2f})")
    # Visual separator between consecutive models.
    print("\n" + "=" * 50)

--- Best Prompt Types for Model: llama3.1:8b ---

  Dataset: sp
  ------------------------------
    1. default_improved (Score: 62.00)
    2. pattern_matching (Score: 62.00)
    3. intuitive (Score: 54.00)

  Dataset: wp
  ------------------------------
    1. pattern_matching (Score: 38.00)
    2. metaphor (Score: 36.00)
    3. default (Score: 34.00)

--- Best Prompt Types for Model: llama3.2:1b ---

  Dataset: sp
  ------------------------------
    1. confidence (Score: 6.00)
    2. default (Score: 2.00)
    3. default_improved (Score: 0.00)

  Dataset: wp
  ------------------------------
    1. confidence (Score: 12.00)
    2. step_by_step (Score: 10.00)
    3. default_improved (Score: 8.00)

--- Best Prompt Types for Model: llama3.2:3b ---

  Dataset: sp
  ------------------------------
    1. default (Score: 32.00)
    2. intuitive (Score: 26.00)
    3. step_by_step (Score: 24.00)

  Dataset: wp
  ------------------------------
    1. intuitive (Score: 24.00)
    2. creative (Sc

In [7]:
from pathlib import Path


def print_results(path: Path | str):
    if isinstance(path, str):
        path = Path(path)
    results_wrapper = None
    with open(path, "rb") as f:
        results_wrapper = pickle.load(f)

    for model, results in results_wrapper.items():
        print(f"{model}: {eval_model_results(results)}%")


# Each report is a heading followed by the per-model scores of one results file.
reports = [
    ("Results for the q4 prompted models", "results/sp_results_naive.pkl"),
    ("Results for the q8 prompted models", "results/sp_results_naive_q8.pkl"),
    (
        "Results for the final prompted models best templates",
        "results/sp_results_best_templates.pkl",
    ),
]

for position, (heading, results_path) in enumerate(reports):
    if position > 0:
        # Blank-line spacer between consecutive reports.
        print("\n\n")
    print(heading)
    print_results(results_path)

Results for the q4 prompted models
llama3.1:8b: 45.13556618819777%
llama3.2:1b: 9.090909090909092%
llama3.2:3b: 25.996810207336523%
phi3.5:3.8b: 1.1164274322169059%
phi4:14b: 75.11961722488039%
qwen2.5:0.5b: 17.384370015948964%
qwen2.5:1.5b: 15.94896331738437%
qwen2.5:3b: 20.095693779904305%
qwen2.5:7b: 51.03668261562998%
qwen2.5:14b: 56.61881977671451%
qwen2.5:32b: 68.74003189792663%
gemma2:2b: 11.164274322169058%
gemma2:9b: 76.55502392344498%
gemma2:27b: 82.93460925039872%
mistral-nemo:12b: 49.122807017543856%



Results for the q8 prompted models
llama3.1:8b-instruct-q8_0: 36.68261562998405%
llama3.2:1b-instruct-q8_0: 9.090909090909092%
llama3.2:3b-instruct-q8_0: 26.6347687400319%
phi3.5:3.8b-mini-instruct-q8_0: 4.944178628389154%
phi4:14b-q8_0: 76.55502392344498%
qwen2.5:0.5b-instruct-q8_0: 13.716108452950559%
qwen2.5:1.5b-instruct-q8_0: 15.789473684210526%
qwen2.5:3b-instruct-q8_0: 27.91068580542265%
qwen2.5:7b-instruct-q8_0: 56.77830940988836%
qwen2.5:14b-instruct-q8_0: 59.011164