In [19]:
from scripts import setup_environment

setup_environment()

In [20]:
from scripts.dataset import BrainteaserDataset

dataset = BrainteaserDataset("data")

In [21]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    template_args = {
        "question": riddle_question.question,
        "choices": "\n".join(
            [
                f"({string.ascii_uppercase[j]}) {choice}"
                for j, choice in enumerate(riddle_question.choice_list)
            ]
        ),
        "answer": string.ascii_uppercase[riddle_question.label],
    }

    return template_args


chat_prompt_template = create_prompt_template("default")

In [22]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

base_url = "http://50.173.30.254:40106"
model_builder = OllamaModelBuilder(base_url)

executor = Executor(
    models=[
        # Llama3.1
        model_builder.build_model("llama3.1:8b"),
        # Llama3.2
        model_builder.build_model("llama3.2:1b"),
        model_builder.build_model("llama3.2:3b"),
        # Phi3.5
        model_builder.build_model("phi3.5:3.8b"),
        # Phi4
        model_builder.build_model("phi4:14b"),
        # Qwen2.5
        model_builder.build_model("qwen2.5:0.5b"),
        model_builder.build_model("qwen2.5:1.5b"),
        model_builder.build_model("qwen2.5:3b"),
        model_builder.build_model("qwen2.5:7b"),
        model_builder.build_model("qwen2.5:14b"),
        model_builder.build_model("qwen2.5:32b"),
        # Gemma2
        model_builder.build_model("gemma2:2b"),
        model_builder.build_model("gemma2:9b"),
        model_builder.build_model("gemma2:27b"),
        # Mistral Nemo
        model_builder.build_model("mistral-nemo:12b"),
    ]
)

2025-03-12 10:20:20,637 - INFO - Initialized executor with 15 models.


In [23]:
import numpy as np

from scripts.executor import Dataset

# Set fixed seed for reproducibility
np.random.seed(42)

maximal_n = 8


def create_test_dataset(
    data: list[RiddleQuestion],
    name: str,
    percentage: float = 0.1,
    example_count: int = 10,
) -> tuple[list[RiddleQuestion], Dataset]:
    """
    Create a test dataset by randomly sampling a percentage of the original data.
    Also returns examples for few-shot learning with diverse answers.

    Args:
        data: List of riddle questions
        name: Name of the dataset
        percentage: Percentage of data to use for testing
        example_count: Number of examples to use for few-shot learning

    Returns:
        tuple: (examples for few-shot learning, test dataset)
    """
    # Group data by answer choice
    answer_groups = {}
    for i, question in enumerate(data):
        answer = question.label
        if answer not in answer_groups:
            answer_groups[answer] = []
        answer_groups[answer].append(i)

    # Select diverse examples for few-shot learning
    example_indices = []
    answers = list(answer_groups.keys())

    # Distribute examples evenly across answer choices
    while len(example_indices) < example_count and answers:
        for answer in list(answers):  # Use a copy to safely modify during iteration
            if answer_groups[answer]:
                example_indices.append(answer_groups[answer].pop(0))
                if len(example_indices) >= example_count:
                    break
            else:
                answers.remove(answer)

        # If we don't have enough examples yet and ran out of diverse answers,
        # just add remaining from whatever is available
        if len(example_indices) < example_count and not any(answer_groups.values()):
            break

    # If we still need more examples, take from the beginning
    if len(example_indices) < example_count:
        remaining_indices = [i for i in range(len(data)) if i not in example_indices]
        example_indices.extend(
            remaining_indices[: example_count - len(example_indices)]
        )

    examples = [data[i] for i in sorted(example_indices[:example_count])]

    # Sample from the remaining data for testing
    remaining_indices = [
        i for i in range(len(data)) if i not in example_indices[:example_count]
    ]
    remaining_data = [data[i] for i in remaining_indices]

    indices = np.random.choice(
        len(remaining_data), size=int(len(remaining_data) * percentage), replace=False
    )
    test_dataset = Dataset(name=name, riddles=[remaining_data[i] for i in indices])

    return examples, test_dataset


# Create test datasets
sp_examples, sp_data = create_test_dataset(dataset.sp, "sp", example_count=maximal_n)
wp_examples, wp_data = create_test_dataset(dataset.wp, "wp", example_count=maximal_n)

# Prepare executor data
executor_data = [sp_data, wp_data]

### Few Shot Helpers


In [24]:
from collections.abc import Callable

import dill as pickle
from langchain_core.prompts import ChatPromptTemplate

from scripts.prompt_helpers import TemplateNameType, get_few_shot_chat_template

# Get the best prompt type for each model
with open("results/best_system_prompts_by_model.pkl", "rb") as f:
    best_prompt_types = pickle.load(f)


def few_shot_prompt_template_generator(
    model_name: str, dataset: Dataset, number_of_shots: int
) -> Callable[[str], ChatPromptTemplate]:
    if dataset.name == "sp":
        few_shot_examples = sp_examples
    elif dataset.name == "wp":
        few_shot_examples = wp_examples
    else:
        raise ValueError(f"Unknown dataset: {dataset.name}")

    best_system_template_name: TemplateNameType = best_prompt_types[model_name][
        dataset.name
    ]["prompt_type"]

    template = get_few_shot_chat_template(
        few_shot_examples,
        args_generator,
        best_system_template_name,
        number_of_shots,
    )
    return template

In [25]:
for i in range(1, maximal_n + 1):
    results = await executor.aexecute(
        executor_data,
        lambda model_name,
        dataset,
        number_of_shots=i: few_shot_prompt_template_generator(
            model_name, dataset, number_of_shots
        ),
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        run_name="few-shot-obtain-best-n-system-prompt",
        file_name_suffix=f"n={i}",
    )

2025-03-12 10:20:20,721 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=1'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-1):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:20,776 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=2'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-2):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:20,829 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=3'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-3):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:21,331 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=4'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-4):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:21,433 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=5'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-5):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:21,988 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=6'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-6):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:22,097 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=7'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-7):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-12 10:20:22,692 - INFO - Starting execution 'few-shot-obtain-best-n-system-prompt with suffix 'n=8'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n-system-prompt(n-8):   0%|          | 0/1635 [00:00<?, ?it/s]

## Eval results


In [28]:
import os
import glob
import pickle
from pathlib import Path

# Define the results directory path
results_dir = Path("results/few-shot-obtain-best-n-system-prompt")

# Get all result files
result_files = glob.glob(str(results_dir / "few-shot-obtain-best-n*_n-*_results.pkl"))

# Load all results into a dictionary
# The first key is the suffix (technique name)
total_results = {}

for file_path in result_files:
    # Extract the suffix from the filename
    suffix = os.path.basename(file_path).split("_")[1]

    # Load the results from the pickle file
    with open(file_path, "rb") as f:
        wrapped_results = pickle.load(f)
        total_results[suffix] = wrapped_results.results

print(f"Loaded {len(total_results)} result sets from disk.")

Loaded 8 result sets from disk.


In [29]:
import numpy as np

from scripts.evaluation import calculate_model_accuracy


def get_best_prompt_for_each_model(input_data):
    best_prompts = {}

    # Iterate through each model
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            # For each model, we need to track its best score
            for model, result in models.items():
                # Initialize the best prompt data structure for this model if not yet created
                if model not in best_prompts:
                    best_prompts[model] = {}

                # Assume eval_results returns a score based on the result data
                score_percentage_raw, _, score_percentage_postprocessed, _ = (
                    calculate_model_accuracy(result)
                )

                score = score_percentage_raw
                # If this model doesn't have a best score for this dataset yet or if the current score is better
                if (
                    dataset_type not in best_prompts[model]
                    or score > best_prompts[model][dataset_type]["score"]
                ):
                    best_prompts[model][dataset_type] = {
                        "prompt_type": prompt_type,
                        "score": score,
                    }

    # Now best_prompts contains the best prompt type for each model and dataset
    return best_prompts


def get_best_n_prompts_for_each_model(input_data, n=5):
    best_prompts = {}

    # Iterate through each model
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            for model, result in models.items():
                # Initialize the best prompt data structure for this model if not yet created
                if model not in best_prompts:
                    best_prompts[model] = {}

                # Calculate the score for the model with the current prompt type and dataset
                score_percentage_raw, _, score_percentage_postprocessed, _ = (
                    calculate_model_accuracy(result)
                )
                score = score_percentage_raw

                # Initialize the list of prompts for this model and dataset type if not created
                if dataset_type not in best_prompts[model]:
                    best_prompts[model][dataset_type] = []

                # Append the prompt type, score, and length to the list
                best_prompts[model][dataset_type].append(
                    {
                        "prompt_type": prompt_type,
                        "score": score,
                    }
                )

    # Now sort the list of prompts for each model and dataset type and keep the top n
    top_n_prompts = {}
    for model, dataset_dict in best_prompts.items():
        top_n_prompts_for_model = {}
        for dataset_type, prompts in dataset_dict.items():
            # First, sort all prompts by score (highest first)
            all_sorted_prompts = sorted(prompts, key=lambda x: x["score"], reverse=True)

            # Apply a penalty to higher n values when scores are close
            # This will reorder prompts to prefer lower n values when scores are within threshold
            penalized_prompts = []
            for prompt in all_sorted_prompts:
                n_value = int(prompt["prompt_type"].split("-")[1])
                # Calculate a penalized score that favors lower n values when scores are close
                # If two scores are within 3%, each additional n point reduces score by 0.5%
                penalized_score = prompt["score"]

                # Compare with all better-scoring prompts
                for better_prompt in all_sorted_prompts:
                    if better_prompt["score"] <= prompt["score"]:
                        continue

                    better_n = int(better_prompt["prompt_type"].split("-")[1])
                    score_diff = better_prompt["score"] - prompt["score"]

                    # If the score difference is small but n is significantly smaller
                    if score_diff <= 3.0 and n_value > better_n:
                        # Boost the score of the smaller n value prompt
                        penalized_score = better_prompt["score"] + 2.0
                        break

                penalized_prompts.append(
                    {
                        "prompt_type": prompt["prompt_type"],
                        "score": prompt["score"],
                        "penalized_score": penalized_score,
                        "n_value": n_value,
                    }
                )

            # Sort by penalized score first, then by original score, then by lower n value
            final_sorted_prompts = sorted(
                penalized_prompts,
                key=lambda x: (x["penalized_score"], x["score"], -x["n_value"]),
                reverse=True,
            )

            # Take the top n prompts after reordering
            top_prompts = []
            for p in final_sorted_prompts[:n]:
                top_prompts.append(
                    {"prompt_type": p["prompt_type"], "score": p["score"]}
                )

            # Make sure the final results are sorted by highest score on top
            top_prompts = sorted(top_prompts, key=lambda x: x["score"], reverse=True)

            top_n_prompts_for_model[dataset_type] = top_prompts
        top_n_prompts[model] = top_n_prompts_for_model

    return top_n_prompts


# Get the best prompt type for each model
best_prompt_types = get_best_n_prompts_for_each_model(total_results, n=4)

# Print the results as a formatted table using pandas

# for model, dataset_dict in best_prompt_types.items():
#     print(f"\n{'-' * 80}\nModel: {model}")
#     for dataset_type, prompts in dataset_dict.items():
#         print(f"\nDataset: {dataset_type}")

#         # Create a DataFrame from the prompts data
#         df = pd.DataFrame(prompts)

#         # Rename columns for better display
#         df = df.rename(
#             columns={
#                 "prompt_type": "Prompt Type",
#                 "score": "Score",
#             }
#         )

#         # Format the score column to 4 decimal places
#         df["Score"] = df["Score"].map("{:.4f}".format)

#         # Display the DataFrame
#         display(df)

# Extract the best prompt type (highest score) for each model and dataset
best_n_value_by_model = {}

for model, dataset_dict in best_prompt_types.items():
    best_n_value_by_model[model] = {}

    for dataset_name, prompts in dataset_dict.items():
        if prompts:
            # Get the prompt type with the highest score
            best_prompt_type = prompts[0]["prompt_type"]

            # Extract the n value from the prompt type (format is "n-X")
            n_value = int(best_prompt_type.split("-")[1])

            # Store in our dictionary with dataset as key and n as value
            best_n_value_by_model[model][dataset_name] = n_value

# Print the results
print("\nBest n value for each model:")
for model, dataset_dict in best_n_value_by_model.items():
    print(f"{model}:")
    for dataset_name, n_value in dataset_dict.items():
        print(f"  - {dataset_name}: n={n_value}")

# Save the best prompt types
with open("results/best_n_value_by_model_system-prompt.pkl", "wb") as f:
    pickle.dump(best_n_value_by_model, f)


Best n value for each model:
llama3.1:8b:
  - sp: n=4
  - wp: n=1
llama3.2:1b:
  - sp: n=7
  - wp: n=5
llama3.2:3b:
  - sp: n=8
  - wp: n=8
phi3.5:3.8b:
  - sp: n=4
  - wp: n=8
phi4:14b:
  - sp: n=6
  - wp: n=3
qwen2.5:0.5b:
  - sp: n=6
  - wp: n=6
qwen2.5:1.5b:
  - sp: n=1
  - wp: n=2
qwen2.5:3b:
  - sp: n=5
  - wp: n=2
qwen2.5:7b:
  - sp: n=5
  - wp: n=8
qwen2.5:14b:
  - sp: n=4
  - wp: n=4
qwen2.5:32b:
  - sp: n=3
  - wp: n=3
gemma2:2b:
  - sp: n=8
  - wp: n=6
gemma2:9b:
  - sp: n=1
  - wp: n=1
gemma2:27b:
  - sp: n=5
  - wp: n=6
mistral-nemo:12b:
  - sp: n=1
  - wp: n=2
