In [1]:
from scripts import setup_environment

# Project-specific bootstrap; it is run first so the cells below execute in the
# prepared environment.
# NOTE(review): the exact effects (logging setup, paths, env vars?) live in the
# `scripts` package and are not visible from this notebook -- confirm there.
setup_environment()

In [None]:
import textwrap

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming), keyed by a short strategy name.
# get_system_prompt() looks a template up by its key, and "default" is the key
# create_prompt_template() falls back to when no name is passed.
# NOTE: each value is handed to SystemMessagePromptTemplate.from_template(), so
# a literal "{" or "}" added to a template would be treated as a placeholder
# (LangChain's default f-string template format) -- none of the current entries
# contain placeholders.
system_templates = {
    "default": "You are an AI assistant.",
    "default_improved": "You are an AI assistant specialized in solving lateral thinking questions. You will receive a question with multiple choices and must determine the correct answer using logical reasoning and problem-solving techniques.",
    "step_by_step": "You are a logical problem solver. Break down the question systematically, analyze each answer choice step by step, eliminate incorrect options, and select the best answer.",
    "creative": "You are a lateral thinker. Approach each question with flexible reasoning, exploring unconventional yet valid interpretations before selecting the best answer.",
    "elimination": "You are a strategic reasoner. First, identify and eliminate incorrect answer choices. Then, select the most logical remaining option.",
    "metaphor": "You are skilled in abstract reasoning. Consider both literal and metaphorical meanings in the question and choices before selecting the most insightful answer.",
    "confidence": "You are an analytical decision-maker. Assess the likelihood of correctness for each choice, score them internally, and select the answer with the highest confidence.",
    "perspective_shift": "You are a multi-perspective analyst. Evaluate the question from different angles, considering alternative interpretations before determining the best answer.",
    "common_sense": "You balance logic and practicality. Apply both structured reasoning and real-world common sense to determine the most reasonable answer.",
    "assumption_challenge": "You are a critical thinker. Identify and question hidden assumptions in the question and choices before selecting the answer that best challenges or aligns with them.",
    "pattern_matching": "You recognize patterns and relationships. Identify logical structures, recurring themes, or hidden connections in the question and choices before selecting the best answer.",
    "intuitive": "You combine intuition with logic. Generate an initial answer instinctively, then critically evaluate it for logical soundness before finalizing your choice.",
}


def get_system_prompt(template_name: str):
    """Build the system-message prompt template registered under a given name.

    Args:
        template_name: Key into the module-level ``system_templates`` dict.

    Returns:
        The object produced by ``SystemMessagePromptTemplate.from_template``
        for the dedented template text, with ``id=template_name`` forwarded
        to that factory.

    Raises:
        KeyError: If ``template_name`` is not a key of ``system_templates``.
    """
    # Dedent so templates written as indented multi-line strings would still
    # render flush-left; single-line entries pass through unchanged.
    template_text = textwrap.dedent(system_templates[template_name])
    return SystemMessagePromptTemplate.from_template(template_text, id=template_name)


def get_user_prompt():
    """Build the human-message prompt template shared by every run.

    The template exposes two placeholders, ``question`` and ``choices``, and
    ends with an ``Answer:`` cue for the model to complete.

    Returns:
        The object produced by ``HumanMessagePromptTemplate.from_template``
        for the dedented template text.
    """
    # The literal is indented to match the function body; dedent() strips that
    # common indentation (the leading and trailing newlines are kept).
    raw_template = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """
    return HumanMessagePromptTemplate.from_template(textwrap.dedent(raw_template))


def create_prompt_template(
    system_prompt_template_name: str = "default",
):
    """Assemble the full chat prompt: one system message followed by the user message.

    Args:
        system_prompt_template_name: Key of the system template to use,
            forwarded to ``get_system_prompt``. Defaults to ``"default"``.

    Returns:
        The object produced by ``ChatPromptTemplate.from_messages`` for the
        ``[system, user]`` message pair.

    Raises:
        KeyError: Propagated from ``get_system_prompt`` for an unknown name.
    """
    # Order matters: the system (priming) message must precede the user turn.
    messages = [
        get_system_prompt(system_prompt_template_name),
        get_user_prompt(),
    ]
    return ChatPromptTemplate.from_messages(messages)

In [3]:
from scripts.dataset import BrainteaserDataset

# Load the brain-teaser dataset; the run cells further down read its `sp` and
# `wp` attributes.
# NOTE(review): "data" is presumably a directory path relative to the notebook's
# working directory -- confirm against BrainteaserDataset.
dataset = BrainteaserDataset("data")

In [4]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Turn one riddle into the keyword arguments expected by the chat prompt.

    Args:
        riddle_question: Item providing ``question`` and ``choice_list``.

    Returns:
        A dict with two keys:
        ``"question"`` -- the riddle's ``question`` value, and
        ``"choices"`` -- the choices joined by newlines, each prefixed with a
        parenthesised capital letter by position, e.g. ``(A) ...``, ``(B) ...``.

    Raises:
        IndexError: If there are more choices than letters in
            ``string.ascii_uppercase``.
    """
    question_text = riddle_question.question

    choice_lines = []
    for position, option in enumerate(riddle_question.choice_list):
        label = string.ascii_uppercase[position]
        choice_lines.append(f"({label}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(choice_lines),
    }


# Chat prompt used by the runs below: the "default" system message followed by
# the shared user prompt (this is the "naive" baseline configuration the result
# file names refer to).
chat_prompt_template = create_prompt_template("default")

In [None]:
import os

from scripts.lmm import OllamaModel
from scripts.executor import Executor

# Ollama server endpoint. Set the OLLAMA_BASE_URL environment variable to point
# the notebook at a different server; when it is unset the original hard-coded
# address is used, so existing runs behave exactly as before.
base_url = os.environ.get("OLLAMA_BASE_URL", "http://108.179.129.43:31701")

# Quantization suffix shared by most model tags.
quantization = "q8_0"
# The variable was originally misspelled; keep the old name as an alias in case
# another cell still references it.
quanzization = quantization

# Full Ollama model tags, in the order they are evaluated.
# Two of the largest models use a different quantization (q5_K_M / q6_K) than
# the q8_0 default.
# NOTE(review): presumably to fit the server's memory/disk budget -- confirm.
model_tags = [
    # Llama3.1
    f"llama3.1:8b-instruct-{quantization}",
    # Llama3.2
    f"llama3.2:1b-instruct-{quantization}",
    f"llama3.2:3b-instruct-{quantization}",
    # Phi3.5
    f"phi3.5:3.8b-mini-instruct-{quantization}",
    # Phi4
    f"phi4:14b-{quantization}",
    # Qwen2.5
    f"qwen2.5:0.5b-instruct-{quantization}",
    f"qwen2.5:1.5b-instruct-{quantization}",
    f"qwen2.5:3b-instruct-{quantization}",
    f"qwen2.5:7b-instruct-{quantization}",
    f"qwen2.5:14b-instruct-{quantization}",
    "qwen2.5:32b-instruct-q5_K_M",
    # Gemma2
    f"gemma2:2b-instruct-{quantization}",
    f"gemma2:9b-instruct-{quantization}",
    "gemma2:27b-instruct-q6_K",
    # Mistral Nemo
    f"mistral-nemo:12b-instruct-2407-{quantization}",
]

# One OllamaModel per tag, all talking to the same server.
executor = Executor(
    models=[OllamaModel(tag, base_url=base_url) for tag in model_tags]
)

2025-02-18 08:18:55,406 - INFO - Initialized executor with 15 models.


In [7]:
# Evaluate every model held by `executor` on the `sp` split, using the shared
# chat prompt and `args_generator` to fill in its placeholders.
# On the recorded run the log shows an existing results/sp_results_naive_q8.pkl
# being loaded and returned, so no model was actually re-run for this cell.
# NOTE(review): `pickle` files execute code on load -- only resume from result
# files you produced yourself.
sp_results = await executor.aexecute(
    dataset.sp,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    result_file_name="sp_results_naive_q8",  # also the stem of the .pkl seen in the log
)

2025-02-18 08:18:55,461 - INFO - Loading results from result file: results/sp_results_naive_q8.pkl
2025-02-18 08:18:55,954 - INFO - Results file is valid, returning results


In [8]:
# Evaluate every model held by `executor` on the `wp` split with the same
# prompt and argument generator as the `sp` run.
# On the recorded run the log shows: 492 items split into 99 batches of 5, one
# checkpoint written per model under results/checkpoints/wp_results_naive_q8/,
# and the combined results dumped to results/wp_results_naive_q8.pkl.
# The log also records an "ERROR ... no space left on device" while the last
# model (mistral-nemo) was being pulled; the executor logged that it was
# deleting all Ollama models to free space, and that model's checkpoint was
# written afterwards -- the full sweep needs substantial disk space on the
# Ollama server.
wp_results = await executor.aexecute(
    dataset.wp,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    result_file_name="wp_results_naive_q8",  # also the stem of the checkpoint dir and .pkl seen in the log
)

2025-02-18 08:18:55,959 - INFO - Starting asynchronous execution
2025-02-18 08:18:55,960 - INFO - Split dataset of 492 items into 99 batches of size 5
2025-02-18 08:18:55,960 - INFO - Processing llama3.1:8b-instruct-q8_0
2025-02-18 08:18:55,960 - INFO - Pulling Ollama model: llama3.1:8b-instruct-q8_0


llama3.1:8b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:21:14,135 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/llama3.1:8b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:21:14,234 - INFO - Cleaning up llama3.1:8b-instruct-q8_0
2025-02-18 08:21:14,235 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:21:14,235 - INFO - Processing llama3.2:1b-instruct-q8_0
2025-02-18 08:21:14,236 - INFO - Pulling Ollama model: llama3.2:1b-instruct-q8_0


llama3.2:1b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:22:03,226 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/llama3.2:1b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:22:03,517 - INFO - Cleaning up llama3.2:1b-instruct-q8_0
2025-02-18 08:22:03,518 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:22:03,518 - INFO - Processing llama3.2:3b-instruct-q8_0
2025-02-18 08:22:03,518 - INFO - Pulling Ollama model: llama3.2:3b-instruct-q8_0


llama3.2:3b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:23:17,009 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/llama3.2:3b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:23:17,113 - INFO - Cleaning up llama3.2:3b-instruct-q8_0
2025-02-18 08:23:17,113 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:23:17,113 - INFO - Processing phi3.5:3.8b-mini-instruct-q8_0
2025-02-18 08:23:17,114 - INFO - Pulling Ollama model: phi3.5:3.8b-mini-instruct-q8_0


phi3.5:3.8b-mini-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:31:02,862 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/phi3.5:3.8b-mini-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:31:02,968 - INFO - Cleaning up phi3.5:3.8b-mini-instruct-q8_0
2025-02-18 08:31:02,968 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:31:02,968 - INFO - Processing phi4:14b-q8_0
2025-02-18 08:31:02,969 - INFO - Pulling Ollama model: phi4:14b-q8_0


phi4:14b-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:49:17,459 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/phi4:14b-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:49:17,557 - INFO - Cleaning up phi4:14b-q8_0
2025-02-18 08:49:17,557 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:49:17,558 - INFO - Processing qwen2.5:0.5b-instruct-q8_0
2025-02-18 08:49:17,558 - INFO - Pulling Ollama model: qwen2.5:0.5b-instruct-q8_0


qwen2.5:0.5b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:50:05,930 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/qwen2.5:0.5b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:50:06,030 - INFO - Cleaning up qwen2.5:0.5b-instruct-q8_0
2025-02-18 08:50:06,030 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:50:06,031 - INFO - Processing qwen2.5:1.5b-instruct-q8_0
2025-02-18 08:50:06,031 - INFO - Pulling Ollama model: qwen2.5:1.5b-instruct-q8_0


qwen2.5:1.5b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:51:01,175 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/qwen2.5:1.5b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:51:01,269 - INFO - Cleaning up qwen2.5:1.5b-instruct-q8_0
2025-02-18 08:51:01,269 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:51:01,270 - INFO - Processing qwen2.5:3b-instruct-q8_0
2025-02-18 08:51:01,270 - INFO - Pulling Ollama model: qwen2.5:3b-instruct-q8_0


qwen2.5:3b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:52:20,335 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/qwen2.5:3b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:52:20,440 - INFO - Cleaning up qwen2.5:3b-instruct-q8_0
2025-02-18 08:52:20,441 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:52:20,441 - INFO - Processing qwen2.5:7b-instruct-q8_0
2025-02-18 08:52:20,441 - INFO - Pulling Ollama model: qwen2.5:7b-instruct-q8_0


qwen2.5:7b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 08:54:39,526 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/qwen2.5:7b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 08:54:39,919 - INFO - Cleaning up qwen2.5:7b-instruct-q8_0
2025-02-18 08:54:39,920 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 08:54:39,920 - INFO - Processing qwen2.5:14b-instruct-q8_0
2025-02-18 08:54:39,920 - INFO - Pulling Ollama model: qwen2.5:14b-instruct-q8_0


qwen2.5:14b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 09:01:55,374 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/qwen2.5:14b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 09:01:55,592 - INFO - Cleaning up qwen2.5:14b-instruct-q8_0
2025-02-18 09:01:55,594 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 09:01:55,595 - INFO - Processing qwen2.5:32b-instruct-q5_K_M
2025-02-18 09:01:55,596 - INFO - Pulling Ollama model: qwen2.5:32b-instruct-q5_K_M


qwen2.5:32b-instruct-q5_K_M:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 09:18:50,186 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/qwen2.5:32b-instruct-q5_K_M_wp_results_naive_q8.pkl
2025-02-18 09:18:50,292 - INFO - Cleaning up qwen2.5:32b-instruct-q5_K_M
2025-02-18 09:18:50,293 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 09:18:50,293 - INFO - Processing gemma2:2b-instruct-q8_0
2025-02-18 09:18:50,293 - INFO - Pulling Ollama model: gemma2:2b-instruct-q8_0


gemma2:2b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 09:20:11,277 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/gemma2:2b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 09:20:11,374 - INFO - Cleaning up gemma2:2b-instruct-q8_0
2025-02-18 09:20:11,374 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 09:20:11,375 - INFO - Processing gemma2:9b-instruct-q8_0
2025-02-18 09:20:11,375 - INFO - Pulling Ollama model: gemma2:9b-instruct-q8_0


gemma2:9b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 09:23:28,290 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/gemma2:9b-instruct-q8_0_wp_results_naive_q8.pkl
2025-02-18 09:23:28,650 - INFO - Cleaning up gemma2:9b-instruct-q8_0
2025-02-18 09:23:28,651 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 09:23:28,651 - INFO - Processing gemma2:27b-instruct-q6_K
2025-02-18 09:23:28,652 - INFO - Pulling Ollama model: gemma2:27b-instruct-q6_K


gemma2:27b-instruct-q6_K:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 09:34:19,384 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/gemma2:27b-instruct-q6_K_wp_results_naive_q8.pkl
2025-02-18 09:34:19,486 - INFO - Cleaning up gemma2:27b-instruct-q6_K
2025-02-18 09:34:19,487 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 09:34:19,487 - INFO - Processing mistral-nemo:12b-instruct-2407-q8_0
2025-02-18 09:34:19,488 - INFO - Pulling Ollama model: mistral-nemo:12b-instruct-2407-q8_0
2025-02-18 09:35:33,166 - ERROR - Error pulling Ollama model: write /root/.ollama/models/blobs/sha256-824229be17606dd8177fc91c1d330b065bc4f3de2873eab614376b988dcbf48a-partial: no space left on device (status code: 500)
2025-02-18 09:35:33,167 - INFO - Deleting all ollama models to free up space


mistral-nemo:12b-instruct-2407-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 09:38:44,648 - INFO - Creating checkpoint: results/checkpoints/wp_results_naive_q8/mistral-nemo:12b-instruct-2407-q8_0_wp_results_naive_q8.pkl
2025-02-18 09:38:44,747 - INFO - Cleaning up mistral-nemo:12b-instruct-2407-q8_0
2025-02-18 09:38:44,747 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-18 09:38:44,748 - INFO - Asynchronous execution complete
2025-02-18 09:38:44,748 - INFO - Dumping results to results/wp_results_naive_q8.pkl
