In [1]:
from scripts import setup_environment

# Run the project-specific setup helper imported from `scripts`.
# NOTE(review): what it configures is not visible from this notebook — confirm before relying on it.
setup_environment()

In [2]:
from scripts.dataset import BrainteaserDataset

# Build the dataset wrapper from the "data" argument (presumably a directory path — confirm).
# Its `sp` and `wp` attributes are sampled in a later cell.
dataset = BrainteaserDataset("data")

In [3]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    """Build the prompt-template arguments for a single riddle.

    Returns a dict with two keys:
      * ``"question"`` - the riddle's ``question`` attribute, unchanged.
      * ``"choices"`` - one line per entry of ``choice_list``, each prefixed
        with an uppercase letter label: ``(A) ...``, ``(B) ...``, and so on.
    """
    question_text = riddle_question.question

    labelled_choices = []
    for position, option in enumerate(riddle_question.choice_list):
        # Labels come from the uppercase ASCII alphabet, in order.
        letter = string.ascii_uppercase[position]
        labelled_choices.append(f"({letter}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(labelled_choices),
    }


# Prompt template for the "default" technique.
# NOTE(review): the execution loop later in this notebook rebinds `chat_prompt_template`
# for every technique before passing it to the executor.
chat_prompt_template = create_prompt_template("default")

In [4]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

import os

# Ollama endpoint. Set the OLLAMA_BASE_URL environment variable to point the notebook
# at a different server; the original address remains the fallback.
base_url = os.environ.get("OLLAMA_BASE_URL") or "http://107.222.215.224:17001"
model_builder = OllamaModelBuilder(base_url)

# Model tags to evaluate, grouped by family. Models are built in exactly this order.
MODEL_TAGS = [
    # Llama3.1
    "llama3.1:8b",
    # Llama3.2
    "llama3.2:1b",
    "llama3.2:3b",
    # Phi3.5
    "phi3.5:3.8b",
    # Phi4
    "phi4:14b",
    # Qwen2.5
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "qwen2.5:3b",
    "qwen2.5:7b",
    "qwen2.5:14b",
    "qwen2.5:32b",
    # Gemma2
    "gemma2:2b",
    "gemma2:9b",
    "gemma2:27b",
    # Mistral Nemo
    "mistral-nemo:12b",
]

# One executor shared by every run below, holding one built model per tag.
executor = Executor(
    models=[model_builder.build_model(model_tag) for model_tag in MODEL_TAGS]
)

2025-02-27 11:46:10,299 - INFO - Initialized executor with 15 models.


In [5]:
import numpy as np

from scripts.executor import Dataset

# Seed NumPy's global RNG once so the random sampling below is reproducible
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


def create_test_dataset(data: list[RiddleQuestion], name: str, percentage: float = 0.1):
    """Sample a fraction of ``data`` without replacement and wrap it in a ``Dataset``.

    The sample size is ``int(len(data) * percentage)``, i.e. truncated toward zero.
    Indices are drawn with ``np.random.choice`` from NumPy's global RNG, so the
    outcome depends on the global seed and on any draws made before this call.

    Args:
        data: Riddles to sample from.
        name: Name given to the resulting ``Dataset``.
        percentage: Fraction of ``data`` to keep (default ``0.1``).

    Returns:
        ``Dataset(name=name, riddles=<sampled riddles in drawn order>)``.
    """
    population_size = len(data)
    sample_size = int(population_size * percentage)
    picked_positions = np.random.choice(
        population_size, size=sample_size, replace=False
    )
    sampled_riddles = [data[position] for position in picked_positions]
    return Dataset(name=name, riddles=sampled_riddles)


# Draw the evaluation subsets. "sp" is sampled before "wp", and both consume the
# global NumPy RNG, so this order is part of what makes the run reproducible.
sp_data = create_test_dataset(data=dataset.sp, name="sp")
wp_data = create_test_dataset(data=dataset.wp, name="wp")

# Datasets handed to the executor, in evaluation order
executor_data = [sp_data, wp_data]

In [6]:
from scripts.prompt_helpers import system_templates

total_results = {}

for technique in system_templates:
    chat_prompt_template = create_prompt_template(technique)
    results = await executor.aexecute(
        executor_data,
        chat_prompt_template,
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        run_name="zero_shot_system_prompt",
        file_name_suffix=technique,
    )

2025-02-27 11:46:10,317 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'default'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(default):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,435 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'default_improved'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(default-improved):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,559 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'step_by_step'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(step-by-step):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,585 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'creative'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(creative):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,707 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'elimination'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(elimination):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,746 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'metaphor'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(metaphor):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,873 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'confidence'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(confidence):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,998 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'perspective_shift'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(perspective-shift):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:11,028 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'common_sense'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(common-sense):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:11,158 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'assumption_challenge'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(assumption-challenge):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:11,291 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'pattern_matching'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(pattern-matching):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 12:04:34,237 - INFO - Saving results to results/zero-shot-system-prompt/zero-shot-system-prompt_pattern-matching_results.pkl
2025-02-27 12:04:34,681 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'intuitive'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(intuitive):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 12:31:31,580 - INFO - Saving results to results/zero-shot-system-prompt/zero-shot-system-prompt_intuitive_results.pkl


## Eval results


In [1]:
import os
import glob
import pickle
from pathlib import Path

# Define the results directory path
results_dir = Path("results/zero-shot-system-prompt")

# Result files are named <RESULT_PREFIX><technique><RESULT_SUFFIX>
# (see the "Saving results to ..." log lines of the execution cell).
RESULT_PREFIX = "zero-shot-system-prompt_"
RESULT_SUFFIX = "_results.pkl"


def technique_from_filename(file_name: str) -> str:
    """Return the technique part of a result file name.

    Strips the fixed prefix and suffix rather than splitting on ``"_"``, so a
    technique name that itself contains underscores is returned in full.
    """
    return file_name.removeprefix(RESULT_PREFIX).removesuffix(RESULT_SUFFIX)


# Get all result files. Sorted because glob order is arbitrary, and the insertion order
# of `total_results` decides which prompt wins a score tie in the evaluation below.
result_files = sorted(
    glob.glob(str(results_dir / f"{RESULT_PREFIX}*{RESULT_SUFFIX}"))
)

# Load all results into a dictionary
# The first key is the suffix (technique name)
total_results = {}

for file_path in result_files:
    # Extract the suffix from the filename
    suffix = technique_from_filename(os.path.basename(file_path))

    # Load the results from the pickle file.
    # NOTE: unpickling can execute arbitrary code; only load result files you produced.
    with open(file_path, "rb") as f:
        wrapped_results = pickle.load(f)
    total_results[suffix] = wrapped_results.results

print(f"Loaded {len(total_results)} result sets from disk.")

Loaded 12 result sets from disk.


In [16]:
import heapq

import numpy as np

from scripts.evaluation import eval_model_results


def get_best_prompt_for_each_model(input_data):
    """Find the highest-scoring prompt type per model and dataset.

    Args:
        input_data: Nested mapping ``{prompt_type: {dataset_type: {model: result}}}``.
            Each ``result`` is scored with ``eval_model_results``.

    Returns:
        ``{model: {dataset_type: {"prompt_type": ..., "score": ...}}}``. On a tie the
        prompt type encountered first in ``input_data`` is kept, because only a
        strictly greater score replaces the current best.
    """
    winners = {}

    for prompt_name, per_dataset in input_data.items():
        for dataset_name, per_model in per_dataset.items():
            for model_name, model_result in per_model.items():
                # Make sure the model has an entry before scoring
                model_entry = winners.setdefault(model_name, {})

                current_score = eval_model_results(model_result)

                # Skip unless this is the first score seen or strictly better
                incumbent = model_entry.get(dataset_name)
                if incumbent is not None and not current_score > incumbent["score"]:
                    continue

                model_entry[dataset_name] = {
                    "prompt_type": prompt_name,
                    "score": current_score,
                }

    return winners


def get_best_n_prompts_for_each_model(input_data, n=3):
    """Rank prompt types per model and dataset and keep the top ``n``.

    Args:
        input_data: Nested mapping ``{prompt_type: {dataset_type: {model: result}}}``.
            Each ``result`` is scored with ``eval_model_results``.
        n: Maximum number of prompt types to keep per model and dataset.

    Returns:
        ``{model: {dataset_type: [{"prompt_type": ..., "score": ...}, ...]}}`` with
        each list ordered from highest to lowest score and at most ``n`` long.
    """
    scored = {}

    # Pass 1: collect every (prompt type, score) pair per model and dataset
    for prompt_name, per_dataset in input_data.items():
        for dataset_name, per_model in per_dataset.items():
            for model_name, model_result in per_model.items():
                model_entry = scored.setdefault(model_name, {})
                current_score = eval_model_results(model_result)
                model_entry.setdefault(dataset_name, []).append(
                    {"prompt_type": prompt_name, "score": current_score}
                )

    # Pass 2: keep only the n highest-scoring candidates of each list
    return {
        model_name: {
            dataset_name: heapq.nlargest(
                n, candidates, key=lambda entry: entry["score"]
            )
            for dataset_name, candidates in per_dataset.items()
        }
        for model_name, per_dataset in scored.items()
    }


# Select, for every model and dataset, the prompt type with the highest score
best_prompt_types = get_best_prompt_for_each_model(total_results)
print(best_prompt_types)

# Persist the selection as a pickle under results/
with open("results/best_system_prompts_by_model.pkl", "wb") as output_file:
    pickle.dump(best_prompt_types, output_file)

{'llama3.1:8b': {'sp': {'prompt_type': 'confidence', 'score': 59.67741935483871}, 'wp': {'prompt_type': 'default-improved', 'score': 57.14285714285714}}, 'llama3.2:1b': {'sp': {'prompt_type': 'step-by-step', 'score': 19.35483870967742}, 'wp': {'prompt_type': 'step-by-step', 'score': 8.16326530612245}}, 'llama3.2:3b': {'sp': {'prompt_type': 'default-improved', 'score': 41.935483870967744}, 'wp': {'prompt_type': 'perspective-shift', 'score': 36.734693877551024}}, 'phi3.5:3.8b': {'sp': {'prompt_type': 'elimination', 'score': 25.806451612903224}, 'wp': {'prompt_type': 'creative', 'score': 46.93877551020408}}, 'phi4:14b': {'sp': {'prompt_type': 'default-improved', 'score': 77.41935483870968}, 'wp': {'prompt_type': 'default-improved', 'score': 67.3469387755102}}, 'qwen2.5:0.5b': {'sp': {'prompt_type': 'elimination', 'score': 29.03225806451613}, 'wp': {'prompt_type': 'common-sense', 'score': 34.69387755102041}}, 'qwen2.5:1.5b': {'sp': {'prompt_type': 'metaphor', 'score': 29.03225806451613}, '