In [1]:
from scripts import setup_environment

# Project-level bootstrap; this is the first cell, so it runs ahead of everything else.
# NOTE(review): what it actually configures is defined in `scripts`, not in this
# notebook — confirm there before relying on any particular side effect.
setup_environment()

In [2]:
from scripts.dataset import BrainteaserDataset

# Build the dataset object from the "data" argument
# (presumably a path relative to the notebook — TODO confirm).
# Later cells read the `dataset.sp` and `dataset.wp` question lists from it.
dataset = BrainteaserDataset("data")

In [3]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    """Build the prompt-template arguments for a single riddle.

    The returned dict has three entries:

    - ``"question"``: the riddle text, unchanged.
    - ``"choices"``: one ``"(X) choice"`` line per entry of ``choice_list``,
      lettered A, B, C, ... in list order and joined with newlines.
    - ``"answer"``: the letter at position ``label`` of the uppercase alphabet.
    """
    letters = string.ascii_uppercase
    question_text = riddle_question.question

    lettered_choices = []
    for position, option in enumerate(riddle_question.choice_list):
        lettered_choices.append(f"({letters[position]}) {option}")

    answer_letter = letters[riddle_question.label]

    return {
        "question": question_text,
        "choices": "\n".join(lettered_choices),
        "answer": answer_letter,
    }


# Prompt template created from the "default" template name.
# NOTE(review): no later cell visible in this notebook references
# `chat_prompt_template` — confirm it is still needed.
chat_prompt_template = create_prompt_template("default")

In [4]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

# Endpoint handed to the Ollama model builder for every model below.
# NOTE(review): the address is hard-coded; consider moving it to configuration.
base_url = "http://50.173.30.254:40053"
model_builder = OllamaModelBuilder(base_url)

# Model tags to evaluate, grouped by family. The trailing comments are the
# original author's size notes for each tag.
model_tags = [
    # Llama3.1
    "llama3.1:8b-instruct-q8_0",  # => 9 GB
    # Llama3.2
    "llama3.2:1b-instruct-fp16",  # => 2.5 GB
    "llama3.2:3b-instruct-fp16",  # => 6.4 GB
    # Phi3.5
    "phi3.5:3.8b-mini-instruct-fp16",  # => 7.6 GB
    # Phi4
    "phi4:14b-q8_0",  # => 16 GB
    # Qwen2.5
    "qwen2.5:0.5b-instruct-fp16",  # => 1 GB
    "qwen2.5:1.5b-instruct-fp16",  # => 3.1 GB
    "qwen2.5:3b-instruct-fp16",  # => 6.2 GB
    "qwen2.5:7b-instruct-q8_0",  # => 8.1 GB
    "qwen2.5:14b-instruct-q8_0",  # => 16 GB
    "qwen2.5:32b-instruct-q4_K_M",  # => 20 GB
    # Gemma2
    "gemma2:2b-instruct-fp16",  # => 5.2 GB
    "gemma2:9b-instruct-q8_0",  # => 9.8 GB
    "gemma2:27b-instruct-q4_K_M",  # => 22 GB
    # Mistral Nemo
    "mistral-nemo:12b-instruct-2407-q8_0",  # => 13 GB
]

# Build the models in list order and hand them to a single executor.
executor = Executor(models=[model_builder.build_model(tag) for tag in model_tags])

2025-03-12 10:50:42,736 - INFO - Initialized executor with 15 models.


In [5]:
import numpy as np

from scripts.executor import Dataset

# Set fixed seed for reproducibility
# (this seeds NumPy's legacy global RNG, i.e. the one behind `np.random.*` calls)
np.random.seed(42)

# Number of few-shot examples held out per dataset; passed as `example_count`
# to `create_n_shot_dataset` further down in this cell.
maximal_n = 8


def _round_robin_indices(index_groups: list[list[int]]):
    """Yield the 1st index of every group, then the 2nd of every group, and so on."""
    deepest = max(map(len, index_groups), default=0)
    for depth in range(deepest):
        for group in index_groups:
            if depth < len(group):
                yield group[depth]


def create_n_shot_dataset(
    data: list[RiddleQuestion],
    name: str,
    example_count: int = 10,
) -> tuple[list[RiddleQuestion], Dataset]:
    """
    Split riddles into few-shot examples (balanced across answer labels) and a test dataset.

    Selection is deterministic, not random: question positions are grouped by
    ``label`` (groups ordered by first appearance of the label), and examples are
    drawn round-robin — the first question of each label, then the second of
    each label, and so on — until ``example_count`` positions are chosen or the
    data is exhausted. No random number generator is involved.

    Args:
        data: List of riddle questions; each item must expose a hashable ``label``.
        name: Name given to the returned test dataset.
        example_count: Number of examples to hold out for few-shot prompting.
            Values <= 0 hold out nothing; values >= ``len(data)`` hold out
            everything and leave the test dataset empty.

    Returns:
        tuple: (few-shot examples in their original data order,
                ``Dataset`` built from every question not used as an example,
                also in original data order)
    """
    # Positions of the questions per answer label; dict order == first appearance.
    indices_by_label: dict = {}
    for index, question in enumerate(data):
        indices_by_label.setdefault(question.label, []).append(index)

    # Walk the groups round-robin and stop as soon as enough examples are picked.
    example_indices: list[int] = []
    for index in _round_robin_indices(list(indices_by_label.values())):
        if len(example_indices) >= example_count:
            break
        example_indices.append(index)

    # Set membership keeps the split linear in len(data).
    held_out = set(example_indices)

    examples = [data[index] for index in sorted(example_indices)]
    test_dataset = Dataset(
        name=name,
        riddles=[
            question for index, question in enumerate(data) if index not in held_out
        ],
    )

    return examples, test_dataset


# Hold out `maximal_n` few-shot examples per dataset; the rest forms the test set
sp_examples, sp_data = create_n_shot_dataset(
    data=dataset.sp, name="sp", example_count=maximal_n
)
wp_examples, wp_data = create_n_shot_dataset(
    data=dataset.wp, name="wp", example_count=maximal_n
)

# Test datasets later passed to `executor.aexecute`
executor_data = [sp_data, wp_data]

### Few-Shot Helpers


In [6]:
from collections.abc import Callable

import dill as pickle
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate

from scripts.prompt_helpers import TemplateNameType, get_few_shot_chat_template

def _load_pickle(path: str):
    """Deserialize and return the single object stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Per-model lookup tables produced by earlier experiments.
# Indexed later as best_prompt_types[model][dataset]["prompt_type"].
best_prompt_types = _load_pickle("results/best_system_prompts_by_model.pkl")

# Indexed later as best_n_value_by_model[model][dataset].
best_n_value_by_model = _load_pickle("results/best_n_value_by_model.pkl")

# Indexed later as best_n_value_by_model_system_prompt[model][dataset].
best_n_value_by_model_system_prompt = _load_pickle(
    "results/best_n_value_by_model_system_prompt.pkl"
)


def print_n_shot_comparison_table():
    """
    Display, per model and dataset, the best n-shot value found with the default
    system prompt next to the one found with the optimized system prompt.

    The optimized value carries an arrow whenever it differs from the default
    one; a legend explaining the arrows is printed below the table.
    """

    def build_row(model, dataset_name):
        baseline_n = best_n_value_by_model[model][dataset_name]
        optimized_n = best_n_value_by_model_system_prompt[model][dataset_name]

        # Append a trend arrow only when the two values disagree
        if optimized_n == baseline_n:
            optimized_label = f"{optimized_n}"
        elif optimized_n > baseline_n:
            optimized_label = f"{optimized_n} ↑"
        else:
            optimized_label = f"{optimized_n} ↓"

        return {
            "Model": model,
            "Dataset": dataset_name,
            "Default Prompt N": baseline_n,
            "Optimized System Prompt N": optimized_label,
        }

    rows = [
        build_row(model, dataset_name)
        for model in sorted(best_n_value_by_model.keys())
        for dataset_name in ("sp", "wp")
    ]

    display(pd.DataFrame(rows))

    legend_lines = (
        "\nNote: ↑ indicates an increase in optimal n-shot examples with system prompt optimization",
        "      ↓ indicates a decrease in optimal n-shot examples with system prompt optimization",
    )
    for legend_line in legend_lines:
        print(legend_line)


# Print the comparison table
# (runs immediately, so the pickled lookup tables must already be loaded)
print_n_shot_comparison_table()


def _few_shot_examples_for(dataset: Dataset):
    """Return the held-out few-shot examples that belong to ``dataset``.

    Raises:
        ValueError: If ``dataset.name`` is neither "sp" nor "wp".
    """
    if dataset.name == "sp":
        return sp_examples
    if dataset.name == "wp":
        return wp_examples
    raise ValueError(f"Unknown dataset: {dataset.name}")


def _model_key(model_name: str) -> str:
    """Reduce a full model tag to the key used by the per-model lookup tables.

    Everything from the first "b-" onwards is dropped except the "b" itself,
    e.g. "llama3.1:8b-instruct-q8_0" -> "llama3.1:8b". A tag that contains no
    "b-" (e.g. one that is already in key form, such as "gemma2:2b") is
    returned unchanged instead of failing with "substring not found".
    """
    head, separator, _ = model_name.partition("b-")
    return f"{head}b" if separator else model_name


def few_shot_prompt_template_generator_baseline(
    model_name: str, dataset: Dataset
) -> Callable[[str], ChatPromptTemplate]:
    """Build the baseline few-shot template: default system prompt, best n for that prompt.

    Args:
        model_name: Full model tag, e.g. "llama3.1:8b-instruct-q8_0".
        dataset: Dataset being evaluated; only its ``name`` ("sp" or "wp") is read.

    Returns:
        The value produced by ``get_few_shot_chat_template`` for this
        model/dataset combination.

    Raises:
        ValueError: If ``dataset.name`` is not "sp" or "wp".
        KeyError: If the model or dataset is missing from ``best_n_value_by_model``.
    """
    few_shot_examples = _few_shot_examples_for(dataset)
    model_key = _model_key(model_name)

    # Use default system prompt for baseline
    best_system_template_name: TemplateNameType = "default"

    # Best number of shots found for this model and dataset with the default prompt
    number_of_shots = best_n_value_by_model[model_key][dataset.name]

    return get_few_shot_chat_template(
        few_shot_examples,
        args_generator,
        best_system_template_name,
        number_of_shots,
    )


def few_shot_prompt_template_generator_system_prompt(
    model_name: str, dataset: Dataset
) -> Callable[[str], ChatPromptTemplate]:
    """Build the few-shot template that uses the model's best system prompt.

    Args:
        model_name: Full model tag, e.g. "llama3.1:8b-instruct-q8_0".
        dataset: Dataset being evaluated; only its ``name`` ("sp" or "wp") is read.

    Returns:
        The value produced by ``get_few_shot_chat_template`` for this
        model/dataset combination.

    Raises:
        ValueError: If ``dataset.name`` is not "sp" or "wp".
        KeyError: If the model or dataset is missing from ``best_prompt_types``
            or ``best_n_value_by_model_system_prompt``.
    """
    few_shot_examples = _few_shot_examples_for(dataset)
    model_key = _model_key(model_name)

    # Best system template name for this model and dataset
    best_system_template_name: TemplateNameType = best_prompt_types[model_key][
        dataset.name
    ]["prompt_type"]

    # Best number of shots found for this model and dataset with that system prompt
    number_of_shots = best_n_value_by_model_system_prompt[model_key][dataset.name]

    return get_few_shot_chat_template(
        few_shot_examples,
        args_generator,
        best_system_template_name,
        number_of_shots,
    )

Unnamed: 0,Model,Dataset,Default Prompt N,Optimized System Prompt N
0,gemma2:27b,sp,2,7 ↑
1,gemma2:27b,wp,8,6 ↓
2,gemma2:2b,sp,1,3 ↑
3,gemma2:2b,wp,7,6 ↓
4,gemma2:9b,sp,2,1 ↓
5,gemma2:9b,wp,3,2 ↓
6,llama3.1:8b,sp,3,4 ↑
7,llama3.1:8b,wp,4,1 ↓
8,llama3.2:1b,sp,8,7 ↓
9,llama3.2:1b,wp,8,5 ↓



Note: ↑ indicates an increase in optimal n-shot examples with system prompt optimization
      ↓ indicates a decrease in optimal n-shot examples with system prompt optimization


In [7]:
# Baseline run: each model's prompt comes from
# `few_shot_prompt_template_generator_baseline` (default system prompt).
# Top-level `await` is valid here because the cell runs under IPython/Jupyter.
# The recorded log shows the results being saved under results/<run_name>/.
# NOTE(review): the lambda only forwards its two arguments, so passing the
# function itself looks equivalent — confirm Executor does not depend on
# receiving a lambda before simplifying.
# NOTE(review): with resume_from_checkpoint=True a re-run presumably reuses
# earlier checkpoints for this run_name — confirm in Executor.
results_baseline = await executor.aexecute(
    executor_data,
    lambda model_name, dataset: few_shot_prompt_template_generator_baseline(
        model_name, dataset
    ),
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    run_name="baseline-few-shot-evaluation",
)

2025-03-12 10:50:42,957 - INFO - Starting execution 'baseline-few-shot-evaluation': 2 dataset(s) x 15 model(s) = 16545 riddle evaluations


baseline-few-shot-evaluation:   0%|          | 0/16545 [00:00<?, ?it/s]

2025-03-12 12:45:29,260 - INFO - Saving results to results/baseline-few-shot-evaluation/baseline-few-shot-evaluation_results.pkl
2025-03-12 12:45:41,552 - INFO - Execution 'baseline-few-shot-evaluation' completed successfully.


In [8]:
# System-prompt run: each model's prompt comes from
# `few_shot_prompt_template_generator_system_prompt` (per-model best system prompt).
# Top-level `await` is valid here because the cell runs under IPython/Jupyter.
# The recorded log shows the results being saved under results/<run_name>/.
# NOTE(review): the lambda only forwards its two arguments, so passing the
# function itself looks equivalent — confirm Executor does not depend on
# receiving a lambda before simplifying.
# NOTE(review): with resume_from_checkpoint=True a re-run presumably reuses
# earlier checkpoints for this run_name — confirm in Executor.
results_system_prompt = await executor.aexecute(
    executor_data,
    lambda model_name, dataset: few_shot_prompt_template_generator_system_prompt(
        model_name, dataset
    ),
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    run_name="system-optimized-few-shot-evaluation",
)

2025-03-12 12:45:41,564 - INFO - Starting execution 'system-optimized-few-shot-evaluation': 2 dataset(s) x 15 model(s) = 16545 riddle evaluations


system-optimized-few-shot-evaluation:   0%|          | 0/16545 [00:00<?, ?it/s]

2025-03-12 14:26:31,671 - INFO - Saving results to results/system-optimized-few-shot-evaluation/system-optimized-few-shot-evaluation_results.pkl
2025-03-12 14:26:44,267 - INFO - Execution 'system-optimized-few-shot-evaluation' completed successfully.
