In [1]:
from scripts import setup_environment

# One-time project setup imported from `scripts`; must run before the other cells.
# NOTE(review): what it configures is defined in `scripts`, not in this notebook — confirm there.
setup_environment()

In [2]:
import textwrap

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming).
# Maps a template name to the system-message text; `get_system_prompt` looks
# names up in this dict, so the keys are the only valid template names.
# The values are sent to the model as-is, so editing any of them changes the
# prompt the models are evaluated with.
system_templates = {
    "default": "You are a helpful assistant.",
    "step_by_step": "You are a meticulous problem-solver.",
    "creative": "You excel at lateral thinking. Treat this as a riddle.",
    "elimination": "Eliminate wrong options internally.",
    "metaphor": "Interpret keywords metaphorically.",
    "confidence": "Score options internally.",
    "perspective_shift": "Analyze through multiple perspectives silently.",
    "common_sense": "Combine logic and creativity.",
    "assumption_challenge": "Challenge hidden assumptions internally.",
    "pattern_matching": "Find patterns silently.",
    "intuitive": "Critique your intuition internally.",
}


def get_system_prompt(template_name: str):
    """Build the system-message prompt template registered under ``template_name``.

    Args:
        template_name: A key of ``system_templates``.

    Returns:
        A ``SystemMessagePromptTemplate`` created from the dedented template
        text, with ``id=template_name`` forwarded to ``from_template``.

    Raises:
        KeyError: If ``template_name`` is not a key of ``system_templates``.
            The message lists the valid names.
    """
    try:
        system_prompt = system_templates[template_name]
    except KeyError:
        # Re-raise the same exception type, but tell the caller which names exist
        # instead of surfacing only the offending key.
        available = ", ".join(sorted(system_templates))
        raise KeyError(
            f"Unknown system prompt template {template_name!r}; "
            f"available templates: {available}"
        ) from None

    system_prompt = textwrap.dedent(system_prompt)

    system_prompt_template = SystemMessagePromptTemplate.from_template(
        system_prompt, id=template_name
    )
    return system_prompt_template


def get_user_prompt():
    """Build the human-message template that poses one multiple-choice brain teaser.

    The template text contains two placeholders, ``{question}`` and
    ``{choices}``, which are filled in when the prompt is formatted.

    Returns:
        A ``HumanMessagePromptTemplate`` created from the dedented prompt text.
    """
    # The literal is indented to line up with the code; `textwrap.dedent`
    # removes that common left margin before the template is built.
    raw_prompt = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """
    return HumanMessagePromptTemplate.from_template(textwrap.dedent(raw_prompt))


def create_prompt_template(
    system_prompt_template_name: str = "default",
):
    """Assemble the chat prompt: one system message followed by the user message.

    Args:
        system_prompt_template_name: Name passed to ``get_system_prompt`` to
            select the system message. Defaults to ``"default"``.

    Returns:
        A ``ChatPromptTemplate`` built from the system and user message
        templates, in that order.
    """
    messages = [
        get_system_prompt(system_prompt_template_name),
        get_user_prompt(),
    ]
    return ChatPromptTemplate.from_messages(messages)

In [3]:
from scripts.dataset import BrainteaserDataset

# Dataset handle used by the execution cells below (they read `dataset.sp` and `dataset.wp`).
# NOTE(review): "data" is presumably a directory relative to the kernel's working
# directory — confirm against BrainteaserDataset in scripts.dataset.
dataset = BrainteaserDataset("data")

In [4]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Turn one riddle into the keyword arguments expected by the user prompt.

    Args:
        riddle_question: Object exposing ``question`` and ``choice_list``.

    Returns:
        A dict with two keys: ``"question"`` (the riddle text) and
        ``"choices"`` (one line per choice, formatted as ``(A) first``,
        ``(B) second``, and so on, joined with newlines).
    """
    question_text = riddle_question.question

    # Label each choice with consecutive uppercase letters: (A), (B), (C), ...
    labelled_choices = []
    for position, option in enumerate(riddle_question.choice_list):
        letter = string.ascii_uppercase[position]
        labelled_choices.append(f"({letter}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(labelled_choices),
    }


# Baseline prompt for this notebook: the "default" system message followed by
# the shared multiple-choice user prompt.
chat_prompt_template = create_prompt_template("default")

In [5]:
from scripts.lmm import OllamaModel
from scripts.executor import Executor

# Ollama model tags to evaluate, grouped by model family. The order here is the
# order in which the models are handed to the executor.
ollama_model_tags = [
    # Llama3.1
    "llama3.1:8b",
    # Llama3.2
    "llama3.2:1b",
    "llama3.2:3b",
    # Phi3.5
    "phi3.5:3.8b",
    # Phi4
    "phi4:14b",
    # Qwen2.5
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "qwen2.5:3b",
    "qwen2.5:7b",
    "qwen2.5:14b",
    "qwen2.5:32b",
    # Gemma2
    "gemma2:2b",
    "gemma2:9b",
    "gemma2:27b",
    # Mistral Nemo
    "mistral-nemo:12b",
]

# One OllamaModel per tag, all driven by a single executor.
executor = Executor(models=[OllamaModel(tag) for tag in ollama_model_tags])

2025-02-15 11:59:38,110 - INFO - Initialized executor with 15 models.


In [6]:
# Run every model configured on `executor` over `dataset.sp` with the baseline prompt.
# Top-level `await` works here because the notebook kernel provides a running event loop.
# NOTE(review): `sp` is presumably the sentence-puzzle split — confirm in scripts.dataset.
# NOTE(review): the meaning of `dump_to_pickle` / `create_checkpoints` is defined in
# scripts.executor; the captured log shows one checkpoint file per model under
# results/checkpoints/<result_file_name>/.
# NOTE(review): the captured run of this cell aborted with RemoteProtocolError while
# processing qwen2.5:32b, so `sp_results` was never assigned in that session.
sp_results = await executor.aexecute(
    dataset.sp,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    result_file_name="sp_results_naive",
)

2025-02-15 11:59:38,127 - INFO - Starting asynchronous execution
2025-02-15 11:59:38,127 - INFO - Split dataset of 627 items into 126 batches of size 5
2025-02-15 11:59:38,128 - INFO - Processing llama3.1:8b
2025-02-15 11:59:38,128 - INFO - Pulling Ollama model: llama3.1:8b


llama3.1:8b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:00:22,964 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/llama3.1:8b_sp_results_naive.pkl
2025-02-15 12:00:23,086 - INFO - Cleaning up llama3.1:8b
2025-02-15 12:00:23,087 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:00:23,087 - INFO - Processing llama3.2:1b
2025-02-15 12:00:23,087 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:00:48,034 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/llama3.2:1b_sp_results_naive.pkl
2025-02-15 12:00:48,162 - INFO - Cleaning up llama3.2:1b
2025-02-15 12:00:48,163 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:00:48,163 - INFO - Processing llama3.2:3b
2025-02-15 12:00:48,163 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:01:25,428 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/llama3.2:3b_sp_results_naive.pkl
2025-02-15 12:01:25,636 - INFO - Cleaning up llama3.2:3b
2025-02-15 12:01:25,636 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:01:25,636 - INFO - Processing phi3.5:3.8b
2025-02-15 12:01:25,636 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:10:17,950 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/phi3.5:3.8b_sp_results_naive.pkl
2025-02-15 12:10:18,173 - INFO - Cleaning up phi3.5:3.8b
2025-02-15 12:10:18,174 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:10:18,174 - INFO - Processing phi4:14b
2025-02-15 12:10:18,174 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:17:45,457 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/phi4:14b_sp_results_naive.pkl
2025-02-15 12:17:45,587 - INFO - Cleaning up phi4:14b
2025-02-15 12:17:45,588 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:17:45,588 - INFO - Processing qwen2.5:0.5b
2025-02-15 12:17:45,589 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:19:11,344 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/qwen2.5:0.5b_sp_results_naive.pkl
2025-02-15 12:19:11,472 - INFO - Cleaning up qwen2.5:0.5b
2025-02-15 12:19:11,472 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:19:11,473 - INFO - Processing qwen2.5:1.5b
2025-02-15 12:19:11,473 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:19:58,831 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/qwen2.5:1.5b_sp_results_naive.pkl
2025-02-15 12:19:58,955 - INFO - Cleaning up qwen2.5:1.5b
2025-02-15 12:19:58,955 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:19:58,956 - INFO - Processing qwen2.5:3b
2025-02-15 12:19:58,956 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:20:38,336 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/qwen2.5:3b_sp_results_naive.pkl
2025-02-15 12:20:38,467 - INFO - Cleaning up qwen2.5:3b
2025-02-15 12:20:38,467 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:20:38,468 - INFO - Processing qwen2.5:7b
2025-02-15 12:20:38,468 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:21:51,863 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/qwen2.5:7b_sp_results_naive.pkl
2025-02-15 12:21:52,118 - INFO - Cleaning up qwen2.5:7b
2025-02-15 12:21:52,118 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:21:52,118 - INFO - Processing qwen2.5:14b
2025-02-15 12:21:52,118 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:25:36,433 - INFO - Creating checkpoint: results/checkpoints/sp_results_naive/qwen2.5:14b_sp_results_naive.pkl
2025-02-15 12:25:36,566 - INFO - Cleaning up qwen2.5:14b
2025-02-15 12:25:36,566 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 12:25:36,566 - INFO - Processing qwen2.5:32b
2025-02-15 12:25:36,567 - INFO - Pulling Ollama model: qwen2.5:32b


qwen2.5:32b:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-15 12:28:42,094 - INFO - Cleaning up qwen2.5:32b
2025-02-15 12:28:42,095 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!


RemoteProtocolError: Server disconnected without sending a response.

In [None]:
# Run every model configured on `executor` over `dataset.wp` with the baseline prompt.
# Top-level `await` works here because the notebook kernel provides a running event loop.
# NOTE(review): `wp` is presumably the word-puzzle split — confirm in scripts.dataset.
# NOTE(review): the meaning of `dump_to_pickle` / `create_checkpoints` is defined in
# scripts.executor — confirm there.
wp_results = await executor.aexecute(
    dataset.wp,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    result_file_name="wp_results_naive",
)

  wp_results = executor.aexecute(
