In [10]:
from scripts import setup_environment

setup_environment()

In [11]:
from scripts.dataset import BrainteaserDataset

dataset = BrainteaserDataset("data")

In [12]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    template_args = {
        "question": riddle_question.question,
        "choices": "\n".join(
            [
                f"({string.ascii_uppercase[j]}) {choice}"
                for j, choice in enumerate(riddle_question.choice_list)
            ]
        ),
        "answer": string.ascii_uppercase[riddle_question.label],
    }

    return template_args


chat_prompt_template = create_prompt_template("default")

In [13]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

base_url = "http://107.222.215.224:23563"
model_builder = OllamaModelBuilder(base_url)

executor = Executor(
    models=[
        # Llama3.1
        model_builder.build_model("llama3.1:8b"),
        # Llama3.2
        model_builder.build_model("llama3.2:1b"),
        model_builder.build_model("llama3.2:3b"),
        # Phi3.5
        model_builder.build_model("phi3.5:3.8b"),
        # Phi4
        model_builder.build_model("phi4:14b"),
        # Qwen2.5
        model_builder.build_model("qwen2.5:0.5b"),
        model_builder.build_model("qwen2.5:1.5b"),
        model_builder.build_model("qwen2.5:3b"),
        model_builder.build_model("qwen2.5:7b"),
        model_builder.build_model("qwen2.5:14b"),
        model_builder.build_model("qwen2.5:32b"),
        # Gemma2
        model_builder.build_model("gemma2:2b"),
        model_builder.build_model("gemma2:9b"),
        model_builder.build_model("gemma2:27b"),
        # Mistral Nemo
        model_builder.build_model("mistral-nemo:12b"),
    ]
)

2025-03-03 23:09:00,039 - INFO - Initialized executor with 15 models.


In [14]:
import numpy as np

from scripts.executor import Dataset

# Set fixed seed for reproducibility
np.random.seed(42)

maximal_n = 8


def create_test_dataset(
    data: list[RiddleQuestion],
    name: str,
    percentage: float = 0.1,
    example_count: int = 10,
) -> tuple[list[RiddleQuestion], Dataset]:
    """
    Create a test dataset by randomly sampling a percentage of the original data.
    Also returns examples for few-shot learning with diverse answers.

    Args:
        data: List of riddle questions
        name: Name of the dataset
        percentage: Percentage of data to use for testing
        example_count: Number of examples to use for few-shot learning

    Returns:
        tuple: (examples for few-shot learning, test dataset)
    """
    # Group data by answer choice
    answer_groups = {}
    for i, question in enumerate(data):
        answer = question.label
        if answer not in answer_groups:
            answer_groups[answer] = []
        answer_groups[answer].append(i)

    # Select diverse examples for few-shot learning
    example_indices = []
    answers = list(answer_groups.keys())

    # Distribute examples evenly across answer choices
    while len(example_indices) < example_count and answers:
        for answer in list(answers):  # Use a copy to safely modify during iteration
            if answer_groups[answer]:
                example_indices.append(answer_groups[answer].pop(0))
                if len(example_indices) >= example_count:
                    break
            else:
                answers.remove(answer)

        # If we don't have enough examples yet and ran out of diverse answers,
        # just add remaining from whatever is available
        if len(example_indices) < example_count and not any(answer_groups.values()):
            break

    # If we still need more examples, take from the beginning
    if len(example_indices) < example_count:
        remaining_indices = [i for i in range(len(data)) if i not in example_indices]
        example_indices.extend(
            remaining_indices[: example_count - len(example_indices)]
        )

    examples = [data[i] for i in sorted(example_indices[:example_count])]

    # Sample from the remaining data for testing
    remaining_indices = [
        i for i in range(len(data)) if i not in example_indices[:example_count]
    ]
    remaining_data = [data[i] for i in remaining_indices]

    indices = np.random.choice(
        len(remaining_data), size=int(len(remaining_data) * percentage), replace=False
    )
    test_dataset = Dataset(name=name, riddles=[remaining_data[i] for i in indices])

    return examples, test_dataset


# Create test datasets
sp_examples, sp_data = create_test_dataset(dataset.sp, "sp", example_count=maximal_n)
wp_examples, wp_data = create_test_dataset(dataset.wp, "wp", example_count=maximal_n)

# Prepare executor data
executor_data = [sp_data, wp_data]

### Few Shot Helpers


In [15]:
from collections.abc import Callable

import dill as pickle
from langchain_core.prompts import ChatPromptTemplate

from scripts.prompt_helpers import TemplateNameType, get_few_shot_chat_template

# Get the best prompt type for each model
with open("results/best_system_prompts_by_model.pkl", "rb") as f:
    best_prompt_types = pickle.load(f)


def few_shot_prompt_template_generator(
    model_name: str, dataset: Dataset, number_of_shots: int
) -> Callable[[str], ChatPromptTemplate]:
    if dataset.name == "sp":
        few_shot_examples = sp_examples
    elif dataset.name == "wp":
        few_shot_examples = wp_examples
    else:
        raise ValueError(f"Unknown dataset: {dataset.name}")

    best_system_template_name: TemplateNameType = best_prompt_types[model_name][
        dataset.name
    ]["prompt_type"]

    template = get_few_shot_chat_template(
        few_shot_examples,
        args_generator,
        best_system_template_name,
        number_of_shots,
    )
    return template

In [16]:
for i in range(1, maximal_n + 1):
    results = await executor.aexecute(
        executor_data,
        lambda model_name,
        dataset,
        number_of_shots=i: few_shot_prompt_template_generator(
            model_name, dataset, number_of_shots
        ),
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        run_name="few_shot_obtain_best_n",
        file_name_suffix=f"n={i}",
    )

2025-03-03 23:09:00,140 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=1'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-1):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:00,192 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=2'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-2):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:00,240 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=3'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-3):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:00,850 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=4'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-4):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:00,925 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=5'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-5):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:01,012 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=6'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-6):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:01,627 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=7'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-7):   0%|          | 0/1635 [00:00<?, ?it/s]

2025-03-03 23:09:01,750 - INFO - Starting execution 'few-shot-obtain-best-n with suffix 'n=8'': 2 dataset(s) x 15 model(s) = 1635 riddle evaluations


few-shot-obtain-best-n(n-8):   0%|          | 0/1635 [00:00<?, ?it/s]

## Eval results


In [17]:
import os
import glob
import pickle
from pathlib import Path

# Define the results directory path
results_dir = Path("results/few-shot-obtain-best-n")

# Get all result files
result_files = glob.glob(str(results_dir / "few-shot-obtain-best-n_n-*_results.pkl"))

# Load all results into a dictionary
# The first key is the suffix (technique name)
total_results = {}

for file_path in result_files:
    # Extract the suffix from the filename
    suffix = os.path.basename(file_path).split("_")[1]

    # Load the results from the pickle file
    with open(file_path, "rb") as f:
        wrapped_results = pickle.load(f)
        total_results[suffix] = wrapped_results.results

print(f"Loaded {len(total_results)} result sets from disk.")

Loaded 8 result sets from disk.


In [18]:
import heapq

import numpy as np
import pandas as pd
from IPython.display import display

from scripts.evaluation import eval_model_results


def get_best_prompt_for_each_model(input_data):
    best_prompts = {}

    # Iterate through each model
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            # For each model, we need to track its best score
            for model, result in models.items():
                # Initialize the best prompt data structure for this model if not yet created
                if model not in best_prompts:
                    best_prompts[model] = {}

                # Assume eval_results returns a score based on the result data
                score = eval_model_results(result)

                # If this model doesn't have a best score for this dataset yet or if the current score is better
                if (
                    dataset_type not in best_prompts[model]
                    or score > best_prompts[model][dataset_type]["score"]
                ):
                    best_prompts[model][dataset_type] = {
                        "prompt_type": prompt_type,
                        "score": score,
                    }

    # Now best_prompts contains the best prompt type for each model and dataset
    return best_prompts


def get_best_n_prompts_for_each_model(input_data, n=3):
    best_prompts = {}

    # Iterate through each model
    for prompt_type, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            for model, result in models.items():
                # Initialize the best prompt data structure for this model if not yet created
                if model not in best_prompts:
                    best_prompts[model] = {}

                # Calculate the score for the model with the current prompt type and dataset
                score = eval_model_results(result)

                # Initialize the list of prompts for this model and dataset type if not created
                if dataset_type not in best_prompts[model]:
                    best_prompts[model][dataset_type] = []

                # Append the prompt type, score, and length to the list
                best_prompts[model][dataset_type].append(
                    {
                        "prompt_type": prompt_type,
                        "score": score,
                    }
                )

    # Now sort the list of prompts for each model and dataset type and keep the top n
    top_n_prompts = {}
    for model, dataset_dict in best_prompts.items():
        top_n_prompts_for_model = {}
        for dataset_type, prompts in dataset_dict.items():
            # Get the top n prompts by sorting the list based on score (highest score first)
            sorted_prompts = heapq.nlargest(n, prompts, key=lambda x: x["score"])
            top_n_prompts_for_model[dataset_type] = sorted_prompts
        top_n_prompts[model] = top_n_prompts_for_model

    return top_n_prompts


# Get the best prompt type for each model
best_prompt_types = get_best_n_prompts_for_each_model(total_results, n=10)

# Print the results as a formatted table using pandas

for model, dataset_dict in best_prompt_types.items():
    print(f"\n{'-' * 80}\nModel: {model}")
    for dataset_type, prompts in dataset_dict.items():
        print(f"\nDataset: {dataset_type}")

        # Create a DataFrame from the prompts data
        df = pd.DataFrame(prompts)

        # Rename columns for better display
        df = df.rename(
            columns={
                "prompt_type": "Prompt Type",
                "score": "Score",
            }
        )

        # Format the score column to 4 decimal places
        df["Score"] = df["Score"].map("{:.4f}".format)

        # Display the DataFrame
        display(df)

# Save the best prompt types
# with open("results/best_system_prompts_by_model.pkl", "wb") as f:
#     pickle.dump(best_prompt_types, f)


--------------------------------------------------------------------------------
Model: llama3.1:8b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-4,70.4918
1,n-5,68.8525
2,n-6,67.2131
3,n-3,67.2131
4,n-8,67.2131
5,n-1,67.2131
6,n-7,65.5738
7,n-2,65.5738



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-1,66.6667
1,n-5,58.3333
2,n-2,56.25
3,n-6,52.0833
4,n-4,50.0
5,n-3,50.0
6,n-7,47.9167
7,n-8,45.8333



--------------------------------------------------------------------------------
Model: llama3.2:1b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-7,24.5902
1,n-5,22.9508
2,n-8,22.9508
3,n-3,19.6721
4,n-2,18.0328
5,n-6,16.3934
6,n-4,16.3934
7,n-1,13.1148



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-5,27.0833
1,n-3,25.0
2,n-4,20.8333
3,n-8,20.8333
4,n-2,18.75
5,n-1,18.75
6,n-7,16.6667
7,n-6,12.5



--------------------------------------------------------------------------------
Model: llama3.2:3b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-8,49.1803
1,n-7,47.541
2,n-4,45.9016
3,n-2,45.9016
4,n-6,40.9836
5,n-1,40.9836
6,n-5,39.3443
7,n-3,37.7049



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-8,33.3333
1,n-3,31.25
2,n-7,31.25
3,n-5,29.1667
4,n-2,27.0833
5,n-6,22.9167
6,n-1,22.9167
7,n-4,20.8333



--------------------------------------------------------------------------------
Model: phi3.5:3.8b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-4,47.541
1,n-7,44.2623
2,n-5,29.5082
3,n-6,19.6721
4,n-2,19.6721
5,n-3,18.0328
6,n-8,18.0328
7,n-1,1.6393



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-7,41.6667
1,n-8,39.5833
2,n-4,29.1667
3,n-1,25.0
4,n-3,20.8333
5,n-6,16.6667
6,n-2,16.6667
7,n-5,12.5



--------------------------------------------------------------------------------
Model: phi4:14b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-6,81.9672
1,n-3,80.3279
2,n-4,78.6885
3,n-8,75.4098
4,n-5,73.7705
5,n-7,73.7705
6,n-1,72.1311
7,n-2,70.4918



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-3,64.5833
1,n-7,64.5833
2,n-6,62.5
3,n-1,60.4167
4,n-2,56.25
5,n-8,56.25
6,n-4,54.1667
7,n-5,52.0833



--------------------------------------------------------------------------------
Model: qwen2.5:0.5b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-6,34.4262
1,n-5,32.7869
2,n-7,32.7869
3,n-1,29.5082
4,n-2,26.2295
5,n-4,24.5902
6,n-3,22.9508
7,n-8,21.3115



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-6,58.3333
1,n-5,45.8333
2,n-8,45.8333
3,n-7,39.5833
4,n-4,35.4167
5,n-3,29.1667
6,n-1,29.1667
7,n-2,27.0833



--------------------------------------------------------------------------------
Model: qwen2.5:1.5b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-5,59.0164
1,n-1,59.0164
2,n-3,55.7377
3,n-2,55.7377
4,n-8,55.7377
5,n-7,54.0984
6,n-6,50.8197
7,n-4,47.541



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-2,50.0
1,n-1,50.0
2,n-7,47.9167
3,n-3,45.8333
4,n-5,43.75
5,n-6,43.75
6,n-8,41.6667
7,n-4,33.3333



--------------------------------------------------------------------------------
Model: qwen2.5:3b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-5,62.2951
1,n-3,62.2951
2,n-8,62.2951
3,n-4,60.6557
4,n-7,60.6557
5,n-6,59.0164
6,n-2,59.0164
7,n-1,52.459



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-1,43.75
1,n-2,37.5
2,n-8,37.5
3,n-6,33.3333
4,n-7,33.3333
5,n-5,31.25
6,n-4,31.25
7,n-3,31.25



--------------------------------------------------------------------------------
Model: qwen2.5:7b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-5,73.7705
1,n-6,72.1311
2,n-7,72.1311
3,n-4,70.4918
4,n-8,70.4918
5,n-3,67.2131
6,n-2,67.2131
7,n-1,67.2131



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-8,54.1667
1,n-6,52.0833
2,n-4,50.0
3,n-5,47.9167
4,n-7,41.6667
5,n-3,39.5833
6,n-2,39.5833
7,n-1,16.6667



--------------------------------------------------------------------------------
Model: qwen2.5:14b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-4,65.5738
1,n-8,62.2951
2,n-5,59.0164
3,n-3,59.0164
4,n-7,57.377
5,n-2,57.377
6,n-1,57.377
7,n-6,55.7377



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-6,60.4167
1,n-4,60.4167
2,n-7,58.3333
3,n-2,58.3333
4,n-8,56.25
5,n-3,54.1667
6,n-1,54.1667
7,n-5,47.9167



--------------------------------------------------------------------------------
Model: qwen2.5:32b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-3,73.7705
1,n-8,73.7705
2,n-5,72.1311
3,n-7,72.1311
4,n-2,70.4918
5,n-1,70.4918
6,n-4,68.8525
7,n-6,65.5738



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-3,68.75
1,n-6,64.5833
2,n-4,64.5833
3,n-2,64.5833
4,n-7,62.5
5,n-8,62.5
6,n-5,60.4167
7,n-1,60.4167



--------------------------------------------------------------------------------
Model: gemma2:2b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-3,42.623
1,n-2,40.9836
2,n-8,40.9836
3,n-5,39.3443
4,n-1,39.3443
5,n-4,37.7049
6,n-7,37.7049
7,n-6,36.0656



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-6,45.8333
1,n-5,43.75
2,n-8,43.75
3,n-4,41.6667
4,n-7,39.5833
5,n-2,39.5833
6,n-3,37.5
7,n-1,31.25



--------------------------------------------------------------------------------
Model: gemma2:9b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-1,77.0492
1,n-2,70.4918
2,n-5,68.8525
3,n-6,67.2131
4,n-4,67.2131
5,n-7,67.2131
6,n-3,65.5738
7,n-8,63.9344



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-3,72.9167
1,n-2,72.9167
2,n-5,70.8333
3,n-6,70.8333
4,n-7,68.75
5,n-1,68.75
6,n-4,66.6667
7,n-8,66.6667



--------------------------------------------------------------------------------
Model: gemma2:27b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-7,88.5246
1,n-5,86.8852
2,n-8,86.8852
3,n-6,83.6066
4,n-4,83.6066
5,n-3,81.9672
6,n-1,80.3279
7,n-2,77.0492



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-6,70.8333
1,n-7,68.75
2,n-4,66.6667
3,n-8,66.6667
4,n-1,64.5833
5,n-2,62.5
6,n-5,58.3333
7,n-3,58.3333



--------------------------------------------------------------------------------
Model: mistral-nemo:12b

Dataset: sp


Unnamed: 0,Prompt Type,Score
0,n-8,67.2131
1,n-5,65.5738
2,n-7,65.5738
3,n-6,62.2951
4,n-4,59.0164
5,n-2,57.377
6,n-3,55.7377
7,n-1,55.7377



Dataset: wp


Unnamed: 0,Prompt Type,Score
0,n-6,45.8333
1,n-7,45.8333
2,n-1,45.8333
3,n-2,41.6667
4,n-8,35.4167
5,n-4,33.3333
6,n-3,33.3333
7,n-5,31.25
