In [1]:
from scripts import setup_environment

setup_environment()

In [2]:
from scripts.dataset import BrainteaserDataset

dataset = BrainteaserDataset("data")

In [3]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    template_args = {
        "question": riddle_question.question,
        "choices": "\n".join(
            [
                f"({string.ascii_uppercase[j]}) {choice}"
                for j, choice in enumerate(riddle_question.choice_list)
            ]
        ),
    }

    return template_args


chat_prompt_template = create_prompt_template("default")

In [4]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

base_url = "http://107.222.215.224:17001"
model_builder = OllamaModelBuilder(base_url)

executor = Executor(
    models=[
        # Llama3.1
        model_builder.build_model("llama3.1:8b"),
        # Llama3.2
        model_builder.build_model("llama3.2:1b"),
        model_builder.build_model("llama3.2:3b"),
        # Phi3.5
        model_builder.build_model("phi3.5:3.8b"),
        # Phi4
        model_builder.build_model("phi4:14b"),
        # Qwen2.5
        model_builder.build_model("qwen2.5:0.5b"),
        model_builder.build_model("qwen2.5:1.5b"),
        model_builder.build_model("qwen2.5:3b"),
        model_builder.build_model("qwen2.5:7b"),
        model_builder.build_model("qwen2.5:14b"),
        model_builder.build_model("qwen2.5:32b"),
        # Gemma2
        model_builder.build_model("gemma2:2b"),
        model_builder.build_model("gemma2:9b"),
        model_builder.build_model("gemma2:27b"),
        # Mistral Nemo
        model_builder.build_model("mistral-nemo:12b"),
    ]
)

2025-02-27 11:46:10,299 - INFO - Initialized executor with 15 models.


In [5]:
import numpy as np

from scripts.executor import Dataset

# Set fixed seed for reproducibility
np.random.seed(42)


def create_test_dataset(data: list[RiddleQuestion], name: str, percentage: float = 0.1):
    """Create a test dataset by randomly sampling a percentage of the original data."""
    indices = np.random.choice(
        len(data), size=int(len(data) * percentage), replace=False
    )
    return Dataset(name=name, riddles=[data[i] for i in indices])


# Create test datasets
sp_data = create_test_dataset(dataset.sp, "sp")
wp_data = create_test_dataset(dataset.wp, "wp")

# Prepare executor data
executor_data = [sp_data, wp_data]

In [6]:
from scripts.prompt_helpers import system_templates

total_results = {}

for technique in system_templates:
    chat_prompt_template = create_prompt_template(technique)
    results = await executor.aexecute(
        executor_data,
        chat_prompt_template,
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        run_name="zero_shot_system_prompt",
        file_name_suffix=technique,
    )

2025-02-27 11:46:10,317 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'default'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(default):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,435 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'default_improved'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(default-improved):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,559 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'step_by_step'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(step-by-step):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,585 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'creative'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(creative):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,707 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'elimination'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(elimination):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,746 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'metaphor'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(metaphor):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,873 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'confidence'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(confidence):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:10,998 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'perspective_shift'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(perspective-shift):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:11,028 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'common_sense'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(common-sense):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:11,158 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'assumption_challenge'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(assumption-challenge):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 11:46:11,291 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'pattern_matching'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(pattern-matching):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 12:04:34,237 - INFO - Saving results to results/zero-shot-system-prompt/zero-shot-system-prompt_pattern-matching_results.pkl
2025-02-27 12:04:34,681 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'intuitive'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(intuitive):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-02-27 12:31:31,580 - INFO - Saving results to results/zero-shot-system-prompt/zero-shot-system-prompt_intuitive_results.pkl


## Eval results


In [2]:
import os
import glob
import pickle
from pathlib import Path

# Define the results directory path
results_dir = Path("results/zero-shot-system-prompt")

# Get all result files
result_files = glob.glob(str(results_dir / "zero-shot-system-prompt_*_results.pkl"))

# Load all results into a dictionary
# The first key is the suffix (technique name)
total_results = {}

for file_path in result_files:
    # Extract the suffix from the filename
    suffix = os.path.basename(file_path).split("_")[1]

    # Load the results from the pickle file
    with open(file_path, "rb") as f:
        wrapped_results = pickle.load(f)
        total_results[suffix] = wrapped_results.results

print(f"Loaded {len(total_results)} result sets from disk.")

Loaded 12 result sets from disk.


In [13]:
import heapq

import numpy as np
import pandas as pd
from IPython.display import display

from scripts.evaluation import eval_model_results
from scripts.prompt_helpers import system_templates


def get_best_prompt_for_each_model(input_data):
    best_prompts = {}

    # Iterate through each model
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            # For each model, we need to track its best score
            for model, result in models.items():
                # Initialize the best prompt data structure for this model if not yet created
                if model not in best_prompts:
                    best_prompts[model] = {}

                # Assume eval_results returns a score based on the result data
                score = eval_model_results(result)

                # If this model doesn't have a best score for this dataset yet or if the current score is better
                if (
                    dataset_type not in best_prompts[model]
                    or score > best_prompts[model][dataset_type]["score"]
                ):
                    best_prompts[model][dataset_type] = {
                        "prompt_type": prompt_type,
                        "score": score,
                    }

    # Now best_prompts contains the best prompt type for each model and dataset
    return best_prompts


def get_best_n_prompts_for_each_model(input_data, n=3):
    best_prompts = {}

    # Iterate through each model
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            for model, result in models.items():
                # Initialize the best prompt data structure for this model if not yet created
                if model not in best_prompts:
                    best_prompts[model] = {}

                # Calculate the score for the model with the current prompt type and dataset
                score = eval_model_results(result)

                # Get the prompt string length
                prompt_length = len(system_templates[prompt_type])

                # Initialize the list of prompts for this model and dataset type if not created
                if dataset_type not in best_prompts[model]:
                    best_prompts[model][dataset_type] = []

                # Append the prompt type, score, and length to the list
                best_prompts[model][dataset_type].append(
                    {
                        "prompt_type": prompt_type,
                        "prompt_length": prompt_length,
                        "score": score,
                    }
                )

    # Now sort the list of prompts for each model and dataset type and keep the top n
    top_n_prompts = {}
    for model, dataset_dict in best_prompts.items():
        top_n_prompts_for_model = {}
        for dataset_type, prompts in dataset_dict.items():
            # Get the top n prompts by sorting the list based on score (highest score first)
            sorted_prompts = heapq.nlargest(n, prompts, key=lambda x: x["score"])
            top_n_prompts_for_model[dataset_type] = sorted_prompts
        top_n_prompts[model] = top_n_prompts_for_model

    return top_n_prompts


# Get the best prompt type for each model
best_prompt_types = get_best_n_prompts_for_each_model(total_results, n=10)

# Print the results as a formatted table using pandas

for model, dataset_dict in best_prompt_types.items():
    print(f"\n{'-' * 80}\nModel: {model}")
    for dataset_type, prompts in dataset_dict.items():
        print(f"\nDataset: {dataset_type}")

        # Create a DataFrame from the prompts data
        df = pd.DataFrame(prompts)

        # Rename columns for better display
        df = df.rename(
            columns={
                "prompt_type": "Prompt Type",
                "prompt_length": "Prompt Length",
                "score": "Score",
            }
        )

        # Format the score column to 4 decimal places
        df["Score"] = df["Score"].map("{:.4f}".format)

        # Display the DataFrame
        display(df)

# Save the best prompt types
# with open("results/best_system_prompts_by_model.pkl", "wb") as f:
#     pickle.dump(best_prompt_types, f)


--------------------------------------------------------------------------------
Model: llama3.1:8b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,confidence,178,59.6774
1,metaphor,187,56.4516
2,pattern-matching,176,56.4516
3,assumption-challenge,173,51.6129
4,default-improved,160,51.6129
5,elimination,182,50.0
6,intuitive,201,48.3871
7,creative,193,46.7742
8,default,24,43.5484
9,step-by-step,251,40.3226



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,57.1429
1,creative,193,55.102
2,confidence,178,48.9796
3,common-sense,180,46.9388
4,default,24,46.9388
5,metaphor,187,44.898
6,elimination,182,44.898
7,intuitive,201,40.8163
8,perspective-shift,179,40.8163
9,pattern-matching,176,40.8163



--------------------------------------------------------------------------------
Model: llama3.2:1b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,19.3548
1,metaphor,187,11.2903
2,pattern-matching,176,9.6774
3,intuitive,201,8.0645
4,perspective-shift,179,8.0645
5,common-sense,180,8.0645
6,confidence,178,8.0645
7,elimination,182,8.0645
8,creative,193,8.0645
9,assumption-challenge,173,8.0645



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,8.1633
1,default-improved,160,4.0816
2,intuitive,201,2.0408
3,perspective-shift,179,2.0408
4,common-sense,180,2.0408
5,confidence,178,2.0408
6,metaphor,187,2.0408
7,elimination,182,2.0408
8,pattern-matching,176,2.0408
9,creative,193,2.0408



--------------------------------------------------------------------------------
Model: llama3.2:3b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,41.9355
1,elimination,182,40.3226
2,assumption-challenge,173,40.3226
3,creative,193,38.7097
4,intuitive,201,37.0968
5,common-sense,180,33.871
6,perspective-shift,179,29.0323
7,confidence,178,29.0323
8,metaphor,187,25.8065
9,step-by-step,251,25.8065



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,perspective-shift,179,36.7347
1,default,24,32.6531
2,metaphor,187,30.6122
3,step-by-step,251,30.6122
4,creative,193,28.5714
5,common-sense,180,26.5306
6,assumption-challenge,173,26.5306
7,default-improved,160,26.5306
8,intuitive,201,22.449
9,elimination,182,22.449



--------------------------------------------------------------------------------
Model: phi3.5:3.8b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,25.8065
1,confidence,178,22.5806
2,pattern-matching,176,20.9677
3,default,24,19.3548
4,metaphor,187,14.5161
5,creative,193,14.5161
6,step-by-step,251,11.2903
7,perspective-shift,179,8.0645
8,intuitive,201,6.4516
9,assumption-challenge,173,4.8387



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,46.9388
1,metaphor,187,36.7347
2,default-improved,160,36.7347
3,default,24,34.6939
4,elimination,182,26.5306
5,assumption-challenge,173,24.4898
6,pattern-matching,176,20.4082
7,confidence,178,18.3673
8,perspective-shift,179,16.3265
9,common-sense,180,14.2857



--------------------------------------------------------------------------------
Model: phi4:14b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,77.4194
1,default,24,77.4194
2,creative,193,50.0
3,common-sense,180,43.5484
4,pattern-matching,176,38.7097
5,assumption-challenge,173,35.4839
6,intuitive,201,33.871
7,perspective-shift,179,33.871
8,confidence,178,32.2581
9,metaphor,187,30.6452



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,67.3469
1,default,24,46.9388
2,metaphor,187,42.8571
3,step-by-step,251,42.8571
4,elimination,182,40.8163
5,creative,193,38.7755
6,perspective-shift,179,36.7347
7,common-sense,180,36.7347
8,confidence,178,36.7347
9,pattern-matching,176,36.7347



--------------------------------------------------------------------------------
Model: qwen2.5:0.5b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,29.0323
1,perspective-shift,179,25.8065
2,metaphor,187,24.1935
3,pattern-matching,176,24.1935
4,step-by-step,251,24.1935
5,default-improved,160,24.1935
6,common-sense,180,22.5806
7,confidence,178,20.9677
8,creative,193,19.3548
9,assumption-challenge,173,17.7419



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,common-sense,180,34.6939
1,step-by-step,251,34.6939
2,assumption-challenge,173,34.6939
3,elimination,182,32.6531
4,metaphor,187,30.6122
5,pattern-matching,176,28.5714
6,perspective-shift,179,24.4898
7,confidence,178,22.449
8,intuitive,201,20.4082
9,default-improved,160,20.4082



--------------------------------------------------------------------------------
Model: qwen2.5:1.5b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,metaphor,187,29.0323
1,assumption-challenge,173,29.0323
2,common-sense,180,27.4194
3,pattern-matching,176,24.1935
4,default,24,24.1935
5,creative,193,22.5806
6,step-by-step,251,22.5806
7,perspective-shift,179,19.3548
8,confidence,178,19.3548
9,default-improved,160,19.3548



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,common-sense,180,10.2041
1,creative,193,10.2041
2,step-by-step,251,10.2041
3,default,24,10.2041
4,metaphor,187,8.1633
5,intuitive,201,4.0816
6,assumption-challenge,173,4.0816
7,perspective-shift,179,2.0408
8,confidence,178,2.0408
9,elimination,182,2.0408



--------------------------------------------------------------------------------
Model: qwen2.5:3b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,27.4194
1,default-improved,160,20.9677
2,default,24,19.3548
3,confidence,178,17.7419
4,metaphor,187,17.7419
5,creative,193,17.7419
6,common-sense,180,14.5161
7,assumption-challenge,173,14.5161
8,perspective-shift,179,12.9032
9,pattern-matching,176,11.2903



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,26.5306
1,perspective-shift,179,24.4898
2,creative,193,24.4898
3,elimination,182,22.449
4,intuitive,201,20.4082
5,default-improved,160,20.4082
6,common-sense,180,18.3673
7,metaphor,187,18.3673
8,pattern-matching,176,18.3673
9,assumption-challenge,173,16.3265



--------------------------------------------------------------------------------
Model: qwen2.5:7b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,70.9677
1,confidence,178,69.3548
2,elimination,182,69.3548
3,metaphor,187,66.129
4,creative,193,66.129
5,assumption-challenge,173,66.129
6,intuitive,201,64.5161
7,common-sense,180,64.5161
8,perspective-shift,179,62.9032
9,default,24,62.9032



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,14.2857
1,creative,193,12.2449
2,default-improved,160,12.2449
3,metaphor,187,10.2041
4,pattern-matching,176,10.2041
5,default,24,10.2041
6,intuitive,201,8.1633
7,confidence,178,8.1633
8,step-by-step,251,8.1633
9,perspective-shift,179,6.1224



--------------------------------------------------------------------------------
Model: qwen2.5:14b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,confidence,178,70.9677
1,creative,193,61.2903
2,metaphor,187,59.6774
3,elimination,182,59.6774
4,step-by-step,251,59.6774
5,perspective-shift,179,54.8387
6,default-improved,160,54.8387
7,intuitive,201,53.2258
8,assumption-challenge,173,51.6129
9,pattern-matching,176,50.0



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,57.1429
1,confidence,178,44.898
2,metaphor,187,44.898
3,step-by-step,251,42.8571
4,default-improved,160,40.8163
5,perspective-shift,179,34.6939
6,common-sense,180,32.6531
7,pattern-matching,176,32.6531
8,intuitive,201,30.6122
9,assumption-challenge,173,30.6122



--------------------------------------------------------------------------------
Model: qwen2.5:32b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,metaphor,187,74.1935
1,confidence,178,72.5806
2,default-improved,160,69.3548
3,pattern-matching,176,67.7419
4,creative,193,67.7419
5,elimination,182,66.129
6,assumption-challenge,173,66.129
7,perspective-shift,179,62.9032
8,common-sense,180,62.9032
9,intuitive,201,56.4516



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,metaphor,187,61.2245
1,default-improved,160,61.2245
2,confidence,178,57.1429
3,pattern-matching,176,57.1429
4,creative,193,57.1429
5,assumption-challenge,173,48.9796
6,perspective-shift,179,44.898
7,common-sense,180,44.898
8,elimination,182,42.8571
9,intuitive,201,40.8163



--------------------------------------------------------------------------------
Model: gemma2:2b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,50.0
1,creative,193,41.9355
2,step-by-step,251,41.9355
3,metaphor,187,40.3226
4,perspective-shift,179,38.7097
5,pattern-matching,176,38.7097
6,confidence,178,37.0968
7,default-improved,160,35.4839
8,common-sense,180,33.871
9,default,24,32.2581



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,30.6122
1,metaphor,187,26.5306
2,common-sense,180,18.3673
3,elimination,182,18.3673
4,confidence,178,16.3265
5,step-by-step,251,16.3265
6,intuitive,201,14.2857
7,default,24,14.2857
8,perspective-shift,179,12.2449
9,pattern-matching,176,12.2449



--------------------------------------------------------------------------------
Model: gemma2:9b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,metaphor,187,83.871
1,creative,193,83.871
2,default-improved,160,83.871
3,intuitive,201,82.2581
4,perspective-shift,179,82.2581
5,pattern-matching,176,82.2581
6,common-sense,180,80.6452
7,confidence,178,75.8065
8,assumption-challenge,173,75.8065
9,default,24,69.3548



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,73.4694
1,pattern-matching,176,71.4286
2,default-improved,160,69.3878
3,common-sense,180,65.3061
4,confidence,178,63.2653
5,metaphor,187,61.2245
6,default,24,61.2245
7,intuitive,201,59.1837
8,perspective-shift,179,59.1837
9,assumption-challenge,173,57.1429



--------------------------------------------------------------------------------
Model: gemma2:27b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,perspective-shift,179,90.3226
1,assumption-challenge,173,90.3226
2,default-improved,160,90.3226
3,common-sense,180,87.0968
4,confidence,178,87.0968
5,pattern-matching,176,87.0968
6,intuitive,201,85.4839
7,metaphor,187,85.4839
8,elimination,182,85.4839
9,creative,193,85.4839



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,confidence,178,79.5918
1,elimination,182,77.551
2,creative,193,77.551
3,default-improved,160,77.551
4,intuitive,201,75.5102
5,perspective-shift,179,75.5102
6,metaphor,187,75.5102
7,pattern-matching,176,75.5102
8,common-sense,180,71.4286
9,default,24,69.3878



--------------------------------------------------------------------------------
Model: mistral-nemo:12b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,61.2903
1,confidence,178,58.0645
2,metaphor,187,58.0645
3,creative,193,58.0645
4,intuitive,201,54.8387
5,common-sense,180,53.2258
6,step-by-step,251,51.6129
7,elimination,182,48.3871
8,pattern-matching,176,48.3871
9,perspective-shift,179,46.7742



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,38.7755
1,common-sense,180,30.6122
2,intuitive,201,26.5306
3,creative,193,26.5306
4,confidence,178,24.4898
5,pattern-matching,176,24.4898
6,default-improved,160,24.4898
7,default,24,24.4898
8,perspective-shift,179,22.449
9,metaphor,187,20.4082
