In [11]:
from scripts import setup_environment

# Run the project's environment setup helper (defined in `scripts`; what it
# configures is not visible in this notebook).
# NOTE(review): this cell's recorded execution count (11) is higher than the
# cells below it — confirm the notebook still works with Restart & Run All.
setup_environment()

In [1]:
from scripts.dataset import BrainteaserDataset

# Build the dataset wrapper from "data" (presumably the data directory — confirm).
# Its `sp` and `wp` attributes are sampled into test datasets further below.
dataset = BrainteaserDataset("data")

In [2]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    """Build the prompt-template arguments for a single riddle.

    Returns a dict with the riddle's question text under "question" and, under
    "choices", one line per answer option formatted as "(A) ...", "(B) ...",
    lettered in order from `string.ascii_uppercase`.
    """
    question_text = riddle_question.question

    # Label each option with its uppercase letter, in list order.
    labelled_choices = []
    for letter_index, option in enumerate(riddle_question.choice_list):
        letter = string.ascii_uppercase[letter_index]
        labelled_choices.append(f"({letter}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(labelled_choices),
    }


# Prompt template for the "default" technique.
# NOTE(review): the technique sweep further down reassigns `chat_prompt_template`
# on every iteration, so this initial value does not appear to be used there.
chat_prompt_template = create_prompt_template("default")

In [3]:
import os

from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

# Ollama endpoint. Set the OLLAMA_BASE_URL environment variable to point the
# notebook at a different server; the fallback is the address that was
# previously hard-coded here, so behaviour is unchanged when it is unset.
base_url = os.environ.get("OLLAMA_BASE_URL", "http://50.173.30.254:40106")
# temperature=0 is passed to the builder and applies to every model built below.
model_builder = OllamaModelBuilder(base_url, temperature=0)

# Model tags to evaluate, grouped by family. The order is preserved when the
# models are handed to the executor.
model_names = [
    # Llama3.1
    "llama3.1:8b",
    # Llama3.2
    "llama3.2:1b",
    "llama3.2:3b",
    # Phi3.5
    "phi3.5:3.8b",
    # Phi4
    "phi4:14b",
    # Qwen2.5
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "qwen2.5:3b",
    "qwen2.5:7b",
    "qwen2.5:14b",
    "qwen2.5:32b",
    # Gemma2
    "gemma2:2b",
    "gemma2:9b",
    "gemma2:27b",
    # Mistral Nemo
    "mistral-nemo:12b",
]

executor = Executor(
    models=[model_builder.build_model(model_name) for model_name in model_names]
)

2025-03-11 20:00:21,846 - INFO - Initialized executor with 15 models.


In [4]:
import numpy as np

from scripts.executor import Dataset

# Set fixed seed for reproducibility
# This seeds NumPy's global RNG, which create_test_dataset draws from through
# np.random.choice.
np.random.seed(42)


def create_test_dataset(data: list[RiddleQuestion], name: str, percentage: float = 0.1):
    """Build a named Dataset from a random subset of `data`.

    The subset holds `int(len(data) * percentage)` items (truncated), drawn
    without replacement using NumPy's global random state, so the selection
    depends on the seed in effect when this is called.
    """
    sample_size = int(len(data) * percentage)
    picked_positions = np.random.choice(len(data), size=sample_size, replace=False)

    # Keep the riddles in the order the positions were drawn.
    sampled_riddles = []
    for position in picked_positions:
        sampled_riddles.append(data[position])

    return Dataset(name=name, riddles=sampled_riddles)


# Create test datasets
# Each call uses the default percentage (0.1). Both draw from the same seeded
# global NumPy RNG, so the sp sample is taken first and the wp sample second.
sp_data = create_test_dataset(dataset.sp, "sp")
wp_data = create_test_dataset(dataset.wp, "wp")

# Prepare executor data
executor_data = [sp_data, wp_data]

In [5]:
from scripts.prompt_helpers import system_templates

# NOTE(review): nothing in this cell writes to `total_results`; it is rebuilt
# from the pickled result files in the evaluation section below.
total_results = {}

# Run the whole model/dataset sweep once per system-prompt technique
# (`system_templates` is indexed by technique name elsewhere in this notebook).
for technique in system_templates:
    chat_prompt_template = create_prompt_template(technique)
    # NOTE(review): `results` is overwritten on every iteration and never read in
    # this cell; the runs are presumably persisted through dump_to_pickle — confirm
    # in Executor.
    # NOTE(review): run_name uses underscores, while the captured log output and
    # the directory read by the loader cell use "zero-shot-system-prompt" — confirm
    # the executor normalizes the name, otherwise the loader reads another folder.
    results = await executor.aexecute(
        executor_data,
        chat_prompt_template,
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        run_name="zero_shot_system_prompt",
        file_name_suffix=technique,
    )

2025-03-11 20:00:21,866 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'default'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(default):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:21,981 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'default-improved'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(default-improved):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,113 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'step-by-step'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(step-by-step):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,156 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'creative'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(creative):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,286 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'elimination'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(elimination):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,430 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'metaphor'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(metaphor):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,576 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'confidence'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(confidence):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,610 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'perspective-shift'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(perspective-shift):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,750 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'common-sense'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(common-sense):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,904 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'assumption-challenge'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(assumption-challenge):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:22,944 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'pattern-matching'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(pattern-matching):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-11 20:00:23,081 - INFO - Starting execution 'zero-shot-system-prompt with suffix 'intuitive'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


zero-shot-system-prompt(intuitive):   0%|          | 0/1665 [00:00<?, ?it/s]

## Evaluation results


In [6]:
import os
import glob
import pickle
from pathlib import Path

# Define the results directory path
results_dir = Path("results/zero-shot-system-prompt")

# Result files are named "<file_prefix><technique><file_suffix>".
file_prefix = "zero-shot-system-prompt_"
file_suffix = "_results.pkl"

# Get all result files. glob returns paths in arbitrary order, so sort them to
# make the insertion order of `total_results` (and any downstream tie-breaking
# that depends on it) reproducible across runs and machines.
result_files = sorted(glob.glob(str(results_dir / f"{file_prefix}*{file_suffix}")))

# Load all results into a dictionary
# The first key is the suffix (technique name)
total_results = {}

for file_path in result_files:
    # Recover the technique name by stripping the fixed prefix and suffix.
    # Unlike splitting on "_", this keeps technique names that contain underscores intact.
    file_name = os.path.basename(file_path)
    suffix = file_name[len(file_prefix) : -len(file_suffix)]

    # Load the results from the pickle file.
    # NOTE: unpickling can execute arbitrary code — only load files produced by
    # this project's own runs.
    with open(file_path, "rb") as f:
        wrapped_results = pickle.load(f)
    total_results[suffix] = wrapped_results.results

print(f"Loaded {len(total_results)} result sets from disk.")

Loaded 12 result sets from disk.


In [7]:
import numpy as np
import pandas as pd
from IPython.display import display

from scripts.evaluation import calculate_model_accuracy
from scripts.prompt_helpers import system_templates


def get_best_n_prompts_for_each_model(input_data, n=5, score_threshold_pct=6):
    """Rank system prompts per model and dataset, keeping at most `n` near-best ones.

    `input_data` is nested as prompt type -> dataset type -> model -> result.
    Each result is scored with `calculate_model_accuracy` (the third of its four
    returned values), and the prompt length is `len(system_templates[prompt_type])`.

    For every (model, dataset) pair only prompts scoring at least
    `best_score * (1 - score_threshold_pct / 100)` are kept. They are ordered by
    score, highest first; prompts with exactly equal scores are ordered shortest
    prompt first.

    Returns a dict: model -> dataset type -> list of dicts with the keys
    "prompt_type", "prompt_length" and "score" (at most `n` per list).
    """
    # Pass 1: gather one candidate record per (model, dataset, prompt type).
    candidates = {}
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            for model, result in models.items():
                per_dataset = candidates.setdefault(model, {})
                _, _, score, _ = calculate_model_accuracy(result)
                record = {
                    "prompt_type": prompt_type,
                    "prompt_length": len(system_templates[prompt_type]),
                    "score": score,
                }
                per_dataset.setdefault(dataset_type, []).append(record)

    def _shortlist(records):
        # Highest score first, to find the reference score for the cutoff.
        ranked = sorted(records, key=lambda rec: rec["score"], reverse=True)
        top_score = ranked[0]["score"] if ranked else 0
        cutoff = top_score * (1 - score_threshold_pct / 100)
        near_best = [rec for rec in ranked if rec["score"] >= cutoff]
        # Score descending; exact ties resolved in favour of the shorter prompt.
        near_best.sort(key=lambda rec: (-rec["score"], rec["prompt_length"]))
        return near_best[:n]

    # Pass 2: shortlist the candidates of every (model, dataset) pair.
    return {
        model: {
            dataset_type: _shortlist(records)
            for dataset_type, records in per_dataset.items()
        }
        for model, per_dataset in candidates.items()
    }


def get_best_prompt_for_each_model(input_data):
    """Return the single top-ranked prompt per model and dataset.

    Delegates to `get_best_n_prompts_for_each_model` with n=1 and reshapes each
    one-element list into a dict with the keys "prompt_type" and "score", giving
    model -> dataset type -> {"prompt_type", "score"}.
    """
    ranked = get_best_n_prompts_for_each_model(input_data, n=1)

    def _summarize(entry):
        # Drop the prompt length; only the type and its score are kept.
        return {"prompt_type": entry["prompt_type"], "score": entry["score"]}

    return {
        model: {
            # n=1 was requested, so the first element is the only one.
            dataset_type: _summarize(prompts[0])
            for dataset_type, prompts in per_dataset.items()
        }
        for model, per_dataset in ranked.items()
    }

In [8]:
# Get the best prompt types (up to three near-best per dataset) for each model
best_n_prompt_types = get_best_n_prompts_for_each_model(total_results, n=3)

# Print the results as a formatted table using pandas

for model, dataset_dict in best_n_prompt_types.items():
    print(f"\n{'-' * 80}\nModel: {model}")
    for dataset_type, prompts in dataset_dict.items():
        print(f"\nDataset: {dataset_type}")

        # Create a DataFrame from the prompts data
        df = pd.DataFrame(prompts)

        # Rename columns for better display
        df = df.rename(
            columns={
                "prompt_type": "Prompt Type",
                "prompt_length": "Prompt Length",
                "score": "Score",
            }
        )

        # Format the score column to 4 decimal places
        df["Score"] = df["Score"].map("{:.4f}".format)

        # Display the DataFrame
        display(df)

# Save the best prompt types.
# Compute the value BEFORE opening the file: opening in "wb" mode truncates the
# target immediately, so a failure during the computation would otherwise leave
# an empty, unloadable pickle behind (and destroy the previous good one).
best_prompt_types = get_best_prompt_for_each_model(total_results)
with open("results/best_system_prompts_by_model.pkl", "wb") as f:
    pickle.dump(best_prompt_types, f)


--------------------------------------------------------------------------------
Model: llama3.1:8b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,confidence,178,59.6774
1,pattern-matching,176,56.4516
2,metaphor,187,56.4516



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,57.1429
1,creative,193,55.102



--------------------------------------------------------------------------------
Model: llama3.2:1b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,19.3548



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,8.1633



--------------------------------------------------------------------------------
Model: llama3.2:3b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,41.9355
1,assumption-challenge,173,40.3226
2,elimination,182,40.3226



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,perspective-shift,179,36.7347



--------------------------------------------------------------------------------
Model: phi3.5:3.8b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,25.8065



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,46.9388



--------------------------------------------------------------------------------
Model: phi4:14b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default,24,77.4194
1,default-improved,160,77.4194



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,67.3469



--------------------------------------------------------------------------------
Model: qwen2.5:0.5b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,29.0323



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,assumption-challenge,173,34.6939
1,common-sense,180,34.6939
2,step-by-step,251,34.6939



--------------------------------------------------------------------------------
Model: qwen2.5:1.5b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,assumption-challenge,173,29.0323
1,metaphor,187,29.0323
2,common-sense,180,27.4194



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default,24,10.2041
1,common-sense,180,10.2041
2,creative,193,10.2041



--------------------------------------------------------------------------------
Model: qwen2.5:3b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,27.4194



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,26.5306



--------------------------------------------------------------------------------
Model: qwen2.5:7b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,70.9677
1,confidence,178,69.3548
2,elimination,182,69.3548



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,14.2857



--------------------------------------------------------------------------------
Model: qwen2.5:14b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,confidence,178,77.4194



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,59.1837



--------------------------------------------------------------------------------
Model: qwen2.5:32b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,metaphor,187,74.1935
1,confidence,178,72.5806



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,61.2245
1,metaphor,187,61.2245



--------------------------------------------------------------------------------
Model: gemma2:2b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,elimination,182,51.6129



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,30.6122



--------------------------------------------------------------------------------
Model: gemma2:9b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,83.871
1,metaphor,187,83.871
2,creative,193,83.871



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,creative,193,73.4694
1,pattern-matching,176,71.4286
2,default-improved,160,69.3878



--------------------------------------------------------------------------------
Model: gemma2:27b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,90.3226
1,assumption-challenge,173,90.3226
2,perspective-shift,179,90.3226



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,confidence,178,79.5918
1,default-improved,160,77.551
2,pattern-matching,176,77.551



--------------------------------------------------------------------------------
Model: mistral-nemo:12b

Dataset: sp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,default-improved,160,61.2903
1,creative,193,59.6774
2,confidence,178,58.0645



Dataset: wp


Unnamed: 0,Prompt Type,Prompt Length,Score
0,step-by-step,251,38.7755


In [9]:
def display_best_model_prompt(results):
    """Render a table of the selected prompt type per model.

    `results` maps model -> {"sp": {...}, "wp": {...}}, where each inner dict has
    a "prompt_type" entry. One row is produced per model, with the columns
    "Model", "SP" and "WP", and the table is shown with `display`.
    """
    rows = [
        {
            "Model": model_name,
            "SP": per_dataset["sp"]["prompt_type"],
            "WP": per_dataset["wp"]["prompt_type"],
        }
        for model_name, per_dataset in results.items()
    ]

    display(pd.DataFrame(rows))

In [10]:
# NOTE(review): this "naive" pickle is not written anywhere in the visible
# notebook; it must already exist on disk (presumably saved by an earlier
# run or version of the selection logic — confirm).
with open("results/best_system_prompts_by_model_naive.pkl", "rb") as f:
    best_system_prompts_by_model_naive = pickle.load(f)
    display_best_model_prompt(best_system_prompts_by_model_naive)

# Reload the selection written to this same path by the analysis cell above.
with open("results/best_system_prompts_by_model.pkl", "rb") as f:
    best_system_prompts_by_model = pickle.load(f)
    display_best_model_prompt(best_system_prompts_by_model)


# Compare the naive and the optimized best-prompt selections
def compare_best_prompts(naive_results, optimized_results):
    """Report the models whose selected prompt differs between two selections.

    Both arguments map model -> {"sp": {"prompt_type": ...}, "wp": {"prompt_type": ...}}.
    Models are taken from `naive_results`, in its iteration order; each of them
    must also be a key of `optimized_results`.

    Returns a tuple `(comparison_df, changed_models)`: a DataFrame with one row
    per model whose SP or WP prompt type differs, and the list of those model
    names in the same order.
    """
    rows = []

    for model, naive_entry in naive_results.items():
        naive_sp = naive_entry["sp"]["prompt_type"]
        naive_wp = naive_entry["wp"]["prompt_type"]

        optimized_entry = optimized_results[model]
        optimized_sp = optimized_entry["sp"]["prompt_type"]
        optimized_wp = optimized_entry["wp"]["prompt_type"]

        sp_differs = naive_sp != optimized_sp
        wp_differs = naive_wp != optimized_wp

        # Models with identical selections are left out of the report.
        if not (sp_differs or wp_differs):
            continue

        rows.append(
            {
                "Model": model,
                "Naive SP": naive_sp,
                "Optimized SP": optimized_sp,
                "SP Changed": sp_differs,
                "Naive WP": naive_wp,
                "Optimized WP": optimized_wp,
                "WP Changed": wp_differs,
            }
        )

    changed_models = [row["Model"] for row in rows]
    return pd.DataFrame(rows), changed_models


# Display the comparison
# Only models whose SP or WP selection differs between the two pickles appear
# in the table; `changed_models` holds their names.
comparison_df, changed_models = compare_best_prompts(
    best_system_prompts_by_model_naive, best_system_prompts_by_model
)
display(comparison_df)

Unnamed: 0,Model,SP,WP
0,llama3.1:8b,confidence,default-improved
1,llama3.2:1b,step-by-step,step-by-step
2,llama3.2:3b,default-improved,perspective-shift
3,phi3.5:3.8b,elimination,creative
4,phi4:14b,default-improved,default-improved
5,qwen2.5:0.5b,elimination,common-sense
6,qwen2.5:1.5b,metaphor,common-sense
7,qwen2.5:3b,step-by-step,step-by-step
8,qwen2.5:7b,default-improved,elimination
9,qwen2.5:14b,confidence,creative


Unnamed: 0,Model,SP,WP
0,llama3.1:8b,confidence,default-improved
1,llama3.2:1b,step-by-step,step-by-step
2,llama3.2:3b,default-improved,perspective-shift
3,phi3.5:3.8b,elimination,creative
4,phi4:14b,default,default-improved
5,qwen2.5:0.5b,elimination,assumption-challenge
6,qwen2.5:1.5b,assumption-challenge,default
7,qwen2.5:3b,step-by-step,step-by-step
8,qwen2.5:7b,default-improved,elimination
9,qwen2.5:14b,confidence,creative


Unnamed: 0,Model,Naive SP,Optimized SP,SP Changed,Naive WP,Optimized WP,WP Changed
0,phi4:14b,default-improved,default,True,default-improved,default-improved,False
1,qwen2.5:0.5b,elimination,elimination,False,common-sense,assumption-challenge,True
2,qwen2.5:1.5b,metaphor,assumption-challenge,True,common-sense,default,True
3,qwen2.5:32b,metaphor,metaphor,False,metaphor,default-improved,True
4,gemma2:9b,metaphor,default-improved,True,creative,creative,False
5,gemma2:27b,perspective-shift,default-improved,True,confidence,confidence,False
