In [2]:
from scripts import setup_environment

# Project-specific environment bootstrap, implemented in the local `scripts` package.
# NOTE(review): what exactly it configures is not visible from this notebook.
setup_environment()

In [3]:
import textwrap

import dill as pickle
from langchain.prompts.chat import (
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming), keyed by a short template name.
# get_system_prompt_template() looks a template up by this key and uses the
# same key as the id of the resulting prompt template.
system_templates = {
    "default": "You are an AI assistant.",
    "default_improved": "You are an AI assistant specialized in solving lateral thinking questions. You will receive a question with multiple choices and must determine the correct answer using logical reasoning and problem-solving techniques.",
    "step_by_step": "You are a logical problem solver. Break down the question systematically, analyze each answer choice step by step, eliminate incorrect options, and select the best answer.",
    "creative": "You are a lateral thinker. Approach each question with flexible reasoning, exploring unconventional yet valid interpretations before selecting the best answer.",
    "elimination": "You are a strategic reasoner. First, identify and eliminate incorrect answer choices. Then, select the most logical remaining option.",
    "metaphor": "You are skilled in abstract reasoning. Consider both literal and metaphorical meanings in the question and choices before selecting the most insightful answer.",
    "confidence": "You are an analytical decision-maker. Assess the likelihood of correctness for each choice, score them internally, and select the answer with the highest confidence.",
    "perspective_shift": "You are a multi-perspective analyst. Evaluate the question from different angles, considering alternative interpretations before determining the best answer.",
    "common_sense": "You balance logic and practicality. Apply both structured reasoning and real-world common sense to determine the most reasonable answer.",
    "assumption_challenge": "You are a critical thinker. Identify and question hidden assumptions in the question and choices before selecting the answer that best challenges or aligns with them.",
    "pattern_matching": "You recognize patterns and relationships. Identify logical structures, recurring themes, or hidden connections in the question and choices before selecting the best answer.",
    "intuitive": "You combine intuition with logic. Generate an initial answer instinctively, then critically evaluate it for logical soundness before finalizing your choice.",
}


def get_system_prompt_template(template_name: str):
    """Build the system-message prompt template registered under ``template_name``.

    The text is looked up in ``system_templates`` (an unknown name raises
    ``KeyError``), passed through ``textwrap.dedent`` and wrapped with
    ``SystemMessagePromptTemplate.from_template``; ``template_name`` is
    forwarded as the ``id`` keyword.
    """
    dedented_text = textwrap.dedent(system_templates[template_name])
    return SystemMessagePromptTemplate.from_template(dedented_text, id=template_name)


def get_human_prompt_template():
    """Return the human-message template that poses one multiple-choice brain teaser.

    The template exposes two placeholders, ``{question}`` and ``{choices}``,
    and ends with an ``Answer:`` line.
    """
    indented_text = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """

    # Remove the indentation that only exists because of the source layout.
    return HumanMessagePromptTemplate.from_template(textwrap.dedent(indented_text))


# Per-model system-prompt choice loaded from disk; get_few_shot_chat_template()
# reads it as best_prompt_types[model_name][dataset_name]["prompt_type"].
# NOTE(review): presumably written by an earlier experiment notebook - confirm.
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
best_prompt_types = None
with open("results/dict_best_system_prompt_for_models.pkl", "rb") as f:
    best_prompt_types = pickle.load(f)

In [4]:
from scripts.dataset import BrainteaserDataset

# Later cells read the `sp` and `wp` attributes of this object.
# NOTE(review): "data" is presumably the directory holding the dataset files - confirm.
dataset = BrainteaserDataset("data")

In [5]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Map a riddle onto the variables consumed by the prompt templates.

    Returns a dict with three keys: ``question`` (the riddle text),
    ``choices`` (one ``(A) text`` line per entry of ``choice_list``, lettered
    in order) and ``answer`` (the letter at index ``label``).
    """
    letters = string.ascii_uppercase
    question_text = riddle_question.question

    lettered_choices = []
    for position, choice_text in enumerate(riddle_question.choice_list):
        lettered_choices.append(f"({letters[position]}) {choice_text}")

    return {
        "question": question_text,
        "choices": "\n".join(lettered_choices),
        "answer": letters[riddle_question.label],
    }

In [6]:
# We now create the few-shot examples, following the best practices from https://python.langchain.com/docs/how_to/few_shot_examples_chat/
# Thus we do not put the examples into the initial prompt but provide them as a message history in which the human asks and the AI answers

from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate


def get_few_shot_dataset(dataset: list[RiddleQuestion], number_of_shots: int = 4):
    """Split ``dataset`` into few-shot examples and the riddles left to solve.

    Returns ``(examples, remaining)``: the first ``number_of_shots`` items and
    everything after them, in the original order.
    """
    examples = dataset[:number_of_shots]
    remaining = dataset[number_of_shots:]
    return examples, remaining


def get_few_shot_chat_template(
    model_name: str,
    dataset_name: str,
    dataset: list[RiddleQuestion],
    number_of_shots: int = 4,
):
    """Assemble the few-shot chat prompt for one model on one dataset.

    The first ``number_of_shots`` riddles of ``dataset`` become solved
    examples, each rendered as a human question followed by the AI's answer.
    The system message is the template named by
    ``best_prompt_types[model_name][dataset_name]["prompt_type"]``.

    Returns ``(chat_prompt_template, riddles_to_solve)``, where
    ``riddles_to_solve`` are the riddles not used as examples.
    """
    shot_riddles, remaining_riddles = get_few_shot_dataset(dataset, number_of_shots)

    # One solved riddle = the human question followed by the AI's answer.
    solved_riddle_prompt = ChatPromptTemplate.from_messages(
        [get_human_prompt_template(), ("ai", "{answer}")]
    )
    shot_messages = FewShotChatMessagePromptTemplate(
        example_prompt=solved_riddle_prompt,
        examples=list(map(args_generator, shot_riddles)),
    )

    per_dataset_choice = best_prompt_types[model_name][dataset_name]
    system_template_name = per_dataset_choice["prompt_type"]

    # Order matters: system priming, then the solved examples, then the new riddle.
    messages = [
        get_system_prompt_template(system_template_name),
        shot_messages,
        get_human_prompt_template(),
    ]
    return ChatPromptTemplate.from_messages(messages), remaining_riddles

In [7]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

# NOTE(review): the model server endpoint is hard-coded here; consider moving it to configuration.
base_url = "http://108.179.129.43:31701"
model_builder = OllamaModelBuilder(base_url)

# Model tags to evaluate, grouped by family; models are built in this order.
model_names = [
    # Llama3.1
    "llama3.1:8b",
    # Llama3.2
    "llama3.2:1b",
    "llama3.2:3b",
    # Phi3.5
    "phi3.5:3.8b",
    # Phi4
    "phi4:14b",
    # Qwen2.5
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "qwen2.5:3b",
    "qwen2.5:7b",
    "qwen2.5:14b",
    "qwen2.5:32b",
    # Gemma2
    "gemma2:2b",
    "gemma2:9b",
    "gemma2:27b",
    # Mistral Nemo
    "mistral-nemo:12b",
]

executor = Executor(
    models=[model_builder.build_model(model_name) for model_name in model_names]
)

2025-02-18 16:22:50,622 - INFO - Initialized executor with 15 models.


In [8]:
# Work on the first `test_sample_size` entries of `dataset.sp` and `dataset.wp` only.
# The few-shot examples are later taken from the front of these same samples.
test_sample_size = 50
sp_data = dataset.sp[0:test_sample_size]
wp_data = dataset.wp[0:test_sample_size]

In [9]:
from pathlib import Path

# Largest number of few-shot examples to try; the loop below runs 1..max_shots.
max_shots = 4

# Collected as total_results[n_shots]["sp" | "wp"] = whatever executor.aexecute returns.
total_results = {}
for n_shots in range(1, max_shots + 1):
    print(f"Running few-shot evaluation with {n_shots} shots")
    # The first n_shots riddles are used as in-prompt examples, so only the
    # remaining ones are evaluated. The evaluation set therefore shrinks by one
    # riddle per additional shot, and scores for different n_shots are computed
    # on slightly different sets.
    _, sp_riddles_for_eval = get_few_shot_dataset(sp_data, n_shots)
    _, wp_riddles_for_eval = get_few_shot_dataset(wp_data, n_shots)

    # The lambda receives one argument `x`, forwarded as `model_name`, and
    # returns only the chat template (element [0] of the returned tuple).
    # n_shots is bound as a default argument so the lambda keeps this
    # iteration's value.
    # NOTE(review): the checkpoint/resume flags are handled inside Executor; the
    # "Restored results ..." log lines suggest existing result files are reused
    # instead of re-running the models - confirm before relying on fresh runs.
    sp_results = await executor.aexecute(
        sp_riddles_for_eval,
        lambda x, n_shots=n_shots: get_few_shot_chat_template(
            x, "sp", sp_data, n_shots
        )[0],
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        result_file_name=f"sp_results_explore_best_few_shot_n_{n_shots}",
    )

    # Same procedure for the "wp" data.
    wp_results = await executor.aexecute(
        wp_riddles_for_eval,
        lambda x, n_shots=n_shots: get_few_shot_chat_template(
            x, "wp", wp_data, n_shots
        )[0],
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        result_file_name=f"wp_results_explore_best_few_shot_n_{n_shots}",
    )

    total_results[n_shots] = {"sp": sp_results, "wp": wp_results}

# Persist the combined results; the evaluation section below reads this file back via `results_file`.
results_file = Path("results/results_test_4_explore_best_n_few_shot_prompts.pkl")
results_file.parent.mkdir(parents=True, exist_ok=True)

with results_file.open("wb") as f:
    pickle.dump(total_results, f)

2025-02-18 16:22:50,662 - INFO - Restored results from results file results/sp_results_explore_best_few_shot_n_1.pkl, skipping execution for this model!


2025-02-18 16:22:50,676 - INFO - Restored results from results file results/wp_results_explore_best_few_shot_n_1.pkl, skipping execution for this model!
2025-02-18 16:22:50,798 - INFO - Restored results from results file results/sp_results_explore_best_few_shot_n_2.pkl, skipping execution for this model!
2025-02-18 16:22:50,813 - INFO - Restored results from results file results/wp_results_explore_best_few_shot_n_2.pkl, skipping execution for this model!


Running few-shot evaluation with 1 shots
Running few-shot evaluation with 2 shots
Running few-shot evaluation with 3 shots


2025-02-18 16:22:50,964 - INFO - Restored results from results file results/sp_results_explore_best_few_shot_n_3.pkl, skipping execution for this model!
2025-02-18 16:22:50,984 - INFO - Restored results from results file results/wp_results_explore_best_few_shot_n_3.pkl, skipping execution for this model!
2025-02-18 16:22:51,177 - INFO - Restored results from results file results/sp_results_explore_best_few_shot_n_4.pkl, skipping execution for this model!


Running few-shot evaluation with 4 shots


2025-02-18 16:22:51,206 - INFO - Restored results from results file results/wp_results_explore_best_few_shot_n_4.pkl, skipping execution for this model!


## Eval results


In [13]:
import heapq

from scripts.evaluation import eval_model_results


def get_best_prompt_for_each_model(input_data):
    """Pick, per model and dataset type, the shot count with the highest score.

    ``input_data`` is nested as ``{n: {dataset_type: {model: result}}}``, where
    ``n`` is the number of few-shot examples. Each ``result`` is scored with
    ``eval_model_results``.

    Returns ``{model: {dataset_type: {"n": n, "score": score}}}``. When several
    shot counts reach the same score, the one encountered first is kept.
    """
    winners = {}

    for n, per_dataset in input_data.items():
        for dataset_type, per_model in per_dataset.items():
            for model, result in per_model.items():
                model_entry = winners.setdefault(model, {})
                score = eval_model_results(result)

                # Keep the stored entry unless the new score is strictly higher.
                keep_existing = (
                    dataset_type in model_entry
                    and not score > model_entry[dataset_type]["score"]
                )
                if keep_existing:
                    continue

                model_entry[dataset_type] = {"n": n, "score": score}

    return winners


def get_best_n_prompts_for_each_model(input_data, n=3):
    """Return, per model and dataset type, the ``n`` highest-scoring shot counts.

    Args:
        input_data: Nested mapping ``{n_shots: {dataset_type: {model: result}}}``.
            Each ``result`` is scored with ``eval_model_results``.
        n: Maximum number of entries to keep per model and dataset type.

    Returns:
        ``{model: {dataset_type: [{"n": n_shots, "score": score}, ...]}}`` with
        each list sorted by score, highest first, and at most ``n`` entries long.
    """
    scored = {}

    # The loop variable must not be called `n`: that would overwrite the `n`
    # parameter, and the top-n selection below would then use the last shot
    # count from `input_data` instead of the requested size.
    for n_shots, datasets in input_data.items():
        for dataset_type, models in datasets.items():
            for model, result in models.items():
                score = eval_model_results(result)
                entries = scored.setdefault(model, {}).setdefault(dataset_type, [])
                entries.append({"n": n_shots, "score": score})

    # Keep only the n best-scoring entries of every list.
    return {
        model: {
            dataset_type: heapq.nlargest(n, entries, key=lambda entry: entry["score"])
            for dataset_type, entries in per_dataset.items()
        }
        for model, per_dataset in scored.items()
    }


# Load the results
# NOTE(review): `results_file` is defined in the evaluation cell above, so that
# cell must have run in this kernel before this one.
results_wrapper = None
with open(results_file, "rb") as f:
    results_wrapper = pickle.load(f)

# Get the best number of few-shot examples for each model and dataset type
# NOTE(review): this rebinds `best_prompt_types`, which was loaded earlier with
# a different schema ({"prompt_type": ...}) and is read by
# get_few_shot_chat_template(). After this cell the entries are
# {"n": ..., "score": ...}, so re-running the evaluation cell without reloading
# the original dict would fail on the missing "prompt_type" key.
best_prompt_types = get_best_prompt_for_each_model(results_wrapper)
print(best_prompt_types)

# Save the best few-shot counts
with open("results/dict_best_n_few_shot_for_models.pkl", "wb") as f:
    pickle.dump(best_prompt_types, f)

{'llama3.1:8b': {'sp': {'n': 4, 'score': 78.26086956521739}, 'wp': {'n': 3, 'score': 48.93617021276596}}, 'llama3.2:1b': {'sp': {'n': 3, 'score': 19.148936170212767}, 'wp': {'n': 3, 'score': 21.27659574468085}}, 'llama3.2:3b': {'sp': {'n': 1, 'score': 48.97959183673469}, 'wp': {'n': 1, 'score': 28.57142857142857}}, 'phi3.5:3.8b': {'sp': {'n': 4, 'score': 60.86956521739131}, 'wp': {'n': 3, 'score': 29.78723404255319}}, 'phi4:14b': {'sp': {'n': 4, 'score': 91.30434782608695}, 'wp': {'n': 1, 'score': 69.38775510204081}}, 'qwen2.5:0.5b': {'sp': {'n': 1, 'score': 26.53061224489796}, 'wp': {'n': 4, 'score': 21.73913043478261}}, 'qwen2.5:1.5b': {'sp': {'n': 1, 'score': 57.14285714285714}, 'wp': {'n': 2, 'score': 33.33333333333333}}, 'qwen2.5:3b': {'sp': {'n': 4, 'score': 36.95652173913043}, 'wp': {'n': 4, 'score': 39.130434782608695}}, 'qwen2.5:7b': {'sp': {'n': 3, 'score': 82.97872340425532}, 'wp': {'n': 4, 'score': 50.0}}, 'qwen2.5:14b': {'sp': {'n': 3, 'score': 74.46808510638297}, 'wp': {'