In [1]:
from scripts import setup_environment

# Run the project's environment setup helper imported from `scripts`.
# What it configures is defined in that package and is not visible here.
setup_environment()

In [None]:
import textwrap

import dill as pickle
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming).
# Maps a short template name to the system-message text used to prime the model.
# The keys are the names accepted by get_system_prompt(); the "prompt_type"
# values read from best_prompt_types are used as keys into this dict.
system_templates = {
    "default": "You are an AI assistant.",
    "default_improved": "You are an AI assistant specialized in solving lateral thinking questions. You will receive a question with multiple choices and must determine the correct answer using logical reasoning and problem-solving techniques.",
    "step_by_step": "You are a logical problem solver. Break down the question systematically, analyze each answer choice step by step, eliminate incorrect options, and select the best answer.",
    "creative": "You are a lateral thinker. Approach each question with flexible reasoning, exploring unconventional yet valid interpretations before selecting the best answer.",
    "elimination": "You are a strategic reasoner. First, identify and eliminate incorrect answer choices. Then, select the most logical remaining option.",
    "metaphor": "You are skilled in abstract reasoning. Consider both literal and metaphorical meanings in the question and choices before selecting the most insightful answer.",
    "confidence": "You are an analytical decision-maker. Assess the likelihood of correctness for each choice, score them internally, and select the answer with the highest confidence.",
    "perspective_shift": "You are a multi-perspective analyst. Evaluate the question from different angles, considering alternative interpretations before determining the best answer.",
    "common_sense": "You balance logic and practicality. Apply both structured reasoning and real-world common sense to determine the most reasonable answer.",
    "assumption_challenge": "You are a critical thinker. Identify and question hidden assumptions in the question and choices before selecting the answer that best challenges or aligns with them.",
    "pattern_matching": "You recognize patterns and relationships. Identify logical structures, recurring themes, or hidden connections in the question and choices before selecting the best answer.",
    "intuitive": "You combine intuition with logic. Generate an initial answer instinctively, then critically evaluate it for logical soundness before finalizing your choice.",
}


def get_system_prompt(template_name: str):
    """Build the system-message prompt template registered under ``template_name``.

    The text is looked up in ``system_templates`` (an unknown name raises
    ``KeyError``), dedented, and handed to
    ``SystemMessagePromptTemplate.from_template`` with the template name
    passed as ``id``.
    """
    template_text = textwrap.dedent(system_templates[template_name])
    return SystemMessagePromptTemplate.from_template(template_text, id=template_name)


def get_user_prompt():
    """Build the human-message prompt template shared by every model.

    The template has two placeholders, ``{question}`` and ``{choices}``. The
    common leading indentation of the literal is stripped with
    ``textwrap.dedent`` before the template object is created.
    """
    raw_template = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """
    return HumanMessagePromptTemplate.from_template(textwrap.dedent(raw_template))


# Per-model system prompt selection, loaded from a pickle file on disk.
# It is indexed below as best_prompt_types[model_name][dataset_name]["prompt_type"];
# presumably the file was produced by an earlier prompt-selection run (verify).
# NOTE: `pickle` here is `dill`. Unpickling can execute arbitrary code, so only
# load result files that come from a trusted source.
best_prompt_types = None
with open("results/dict_best_system_prompt_for_models.pkl", "rb") as f:
    best_prompt_types = pickle.load(f)


def create_prompt_template_by_model(
    model_name: str,
    dataset_name: str,
):
    """Assemble the chat prompt template for one model on one dataset.

    The system prompt name is read from
    ``best_prompt_types[model_name][dataset_name]["prompt_type"]`` (a missing
    key raises ``KeyError``). The resulting system message is followed by the
    shared user prompt from ``get_user_prompt``.
    """
    model_entry = best_prompt_types[model_name][dataset_name]
    messages = [
        get_system_prompt(model_entry["prompt_type"]),
        get_user_prompt(),
    ]
    return ChatPromptTemplate.from_messages(messages)

In [3]:
from scripts.dataset import BrainteaserDataset

# Build the brain-teaser dataset object; "data" is presumably the directory that
# holds the dataset files (confirm in scripts.dataset). Later cells read its
# `sp` and `wp` attributes.
dataset = BrainteaserDataset("data")

In [4]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Build the prompt template variables for a single riddle.

    Returns a dict with the question text under ``"question"`` and, under
    ``"choices"``, one line per entry of ``riddle_question.choice_list``
    formatted as ``(A) ...``, ``(B) ...`` and so on, in list order.
    """
    question_text = riddle_question.question

    lettered_choices = []
    for position, option in enumerate(riddle_question.choice_list):
        letter = string.ascii_uppercase[position]
        lettered_choices.append(f"({letter}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(lettered_choices),
    }

In [None]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

# Ollama server address handed to the model builder.
base_url = "http://108.179.129.43:31701"
model_builder = OllamaModelBuilder(base_url)

# Tags of the models under evaluation, grouped by family. The trailing
# comments record each model's approximate size.
model_tags = [
    # Llama3.1
    "llama3.1:8b-instruct-q8_0",  # => 9 GB
    # Llama3.2
    "llama3.2:1b-instruct-fp16",  # => 2.5 GB
    "llama3.2:3b-instruct-fp16",  # => 6.4 GB
    # Phi3.5
    "phi3.5:3.8b-mini-instruct-fp16",  # => 7.6 GB
    # Phi4
    "phi4:14b-q4_K_M",  # => 9.1 GB
    # Qwen2.5
    "qwen2.5:0.5b-instruct-fp16",  # => 1 GB
    "qwen2.5:1.5b-instruct-fp16",  # => 3.1 GB
    "qwen2.5:3b-instruct-fp16",  # => 6.2 GB
    "qwen2.5:7b-instruct-q8_0",  # => 8.1 GB
    "qwen2.5:14b-instruct-q4_K_M",  # => 9 GB
    "qwen2.5:32b-instruct-q4_K_M",  # => 20 GB
    # Gemma2
    "gemma2:2b-instruct-fp16",  # => 5.2 GB
    "gemma2:9b-instruct-q8_0",  # => 9.8 GB
    "gemma2:27b-instruct-q4_K_M",  # => 22 GB
    # Mistral Nemo
    "mistral-nemo:12b-instruct-2407-q4_K_M",  # => 7.5 GB
]

# One model object per tag, built in the order listed above.
executor = Executor(models=[model_builder.build_model(tag) for tag in model_tags])

2025-02-18 14:07:16,499 - INFO - Initialized executor with 15 models.


In [6]:
def get_prompt_template(model_name: str, dataset_name: str):
    """Return the chat prompt template for a full model tag on one dataset.

    The lookup key is the model name cut right after the "b" of the parameter
    count, e.g. ``llama3.1:8b-instruct-fp16`` => ``llama3.1:8b``.

    The ``b-`` marker is searched for only after the ``:`` separator, so a
    "b-" inside the family part of the name cannot truncate the key early.
    A name that has no ``b-`` marker (for example an already-short
    ``llama3.1:8b``) is used unchanged instead of raising ``ValueError``.

    Args:
        model_name: Full model tag, such as ``qwen2.5:7b-instruct-q8_0``.
        dataset_name: Dataset key passed through to
            ``create_prompt_template_by_model`` ("sp" or "wp" in this notebook).

    Returns:
        The value returned by ``create_prompt_template_by_model`` for the
        shortened model name and ``dataset_name``.
    """
    # str.find returns -1 when ":" is absent, so the search then starts at 0
    # and scans the whole name.
    tag_start = model_name.find(":") + 1
    size_end = model_name.find("b-", tag_start)
    if size_end != -1:
        # Keep everything up to and including the "b" of the parameter count.
        model_name = model_name[: size_end + 1]
    return create_prompt_template_by_model(model_name, dataset_name)


# Run the executor over `dataset.sp`. The lambda maps a model name to that
# model's chat prompt template for the "sp" dataset key, and args_generator
# turns each riddle into the template variables.
# The keyword flags suggest results are pickled under the given file name and
# that checkpoints are written and reused on re-run; the exact behaviour lives
# in scripts.executor and is not visible here.
sp_results = await executor.aexecute(
    dataset.sp,
    lambda x: get_prompt_template(x, "sp"),
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    result_file_name="sp_results_best_templates",
)

2025-02-18 14:07:17,019 - INFO - Restored results from results file results/sp_results_best_templates.pkl, skipping execution for this model!


In [7]:
# Run the executor over `dataset.wp`. The lambda maps a model name to that
# model's chat prompt template for the "wp" dataset key, and args_generator
# turns each riddle into the template variables.
# The keyword flags suggest results are pickled under the given file name and
# that checkpoints are written and reused on re-run; the exact behaviour lives
# in scripts.executor and is not visible here.
wp_results = await executor.aexecute(
    dataset.wp,
    lambda x: get_prompt_template(x, "wp"),
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    result_file_name="wp_results_best_templates",
)

2025-02-18 14:07:17,047 - INFO - Split dataset of 492 items into 99 batches of size 5
2025-02-18 14:07:17,058 - INFO - Restored results from checkpoint for model llama3.1:8b-instruct-q8_0, skipping execution for this model!
2025-02-18 14:07:17,067 - INFO - Restored results from checkpoint for model llama3.2:1b-instruct-fp16, skipping execution for this model!
2025-02-18 14:07:17,079 - INFO - Restored results from checkpoint for model llama3.2:3b-instruct-fp16, skipping execution for this model!
2025-02-18 14:07:17,090 - INFO - Restored results from checkpoint for model phi3.5:3.8b-mini-instruct-fp16, skipping execution for this model!
2025-02-18 14:07:17,099 - INFO - Restored results from checkpoint for model phi4:14b-q4_K_M, skipping execution for this model!
2025-02-18 14:07:17,314 - INFO - Restored results from checkpoint for model qwen2.5:0.5b-instruct-fp16, skipping execution for this model!


qwen2.5:1.5b-instruct-fp16:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 14:17:07,821 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/qwen2.5:1.5b-instruct-fp16_wp_results_best_templates.pkl


qwen2.5:3b-instruct-fp16:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 14:28:18,954 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/qwen2.5:3b-instruct-fp16_wp_results_best_templates.pkl


qwen2.5:7b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 14:36:09,067 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/qwen2.5:7b-instruct-q8_0_wp_results_best_templates.pkl


qwen2.5:14b-instruct-q4_K_M:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 14:44:41,159 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/qwen2.5:14b-instruct-q4_K_M_wp_results_best_templates.pkl


qwen2.5:32b-instruct-q4_K_M:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 14:56:48,904 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/qwen2.5:32b-instruct-q4_K_M_wp_results_best_templates.pkl


gemma2:2b-instruct-fp16:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 14:58:29,796 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/gemma2:2b-instruct-fp16_wp_results_best_templates.pkl


gemma2:9b-instruct-q8_0:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 15:01:04,071 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/gemma2:9b-instruct-q8_0_wp_results_best_templates.pkl


gemma2:27b-instruct-q4_K_M:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 15:08:20,618 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/gemma2:27b-instruct-q4_K_M_wp_results_best_templates.pkl


mistral-nemo:12b-instruct-2407-q4_K_M:   0%|          | 0/492 [00:00<?, ?it/s]

2025-02-18 15:11:31,003 - INFO - Creating checkpoint: results/checkpoints/wp_results_best_templates/mistral-nemo:12b-instruct-2407-q4_K_M_wp_results_best_templates.pkl
2025-02-18 15:11:31,101 - INFO - Dumping results to results/wp_results_best_templates.pkl
