In [1]:
from scripts import setup_environment

# Run the project's setup hook before any other cell of the notebook.
# NOTE(review): what it configures is defined under scripts/ and is not visible here.
setup_environment()

In [2]:
import textwrap

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming).
# Keys are the template names accepted by get_system_prompt() (and, through it,
# create_prompt_template()); values are the system-message texts that
# get_system_prompt() hands to SystemMessagePromptTemplate.from_template().
system_templates = {
    "default": "You are a helpful assistant.",
    "step_by_step": "You are a meticulous problem-solver.",
    "creative": "You excel at lateral thinking. Treat this as a riddle.",
    "elimination": "Eliminate wrong options internally.",
    "metaphor": "Interpret keywords metaphorically.",
    "confidence": "Score options internally.",
    "perspective_shift": "Analyze through multiple perspectives silently.",
    "common_sense": "Combine logic and creativity.",
    "assumption_challenge": "Challenge hidden assumptions internally.",
    "pattern_matching": "Find patterns silently.",
    "intuitive": "Critique your intuition internally.",
}


def get_system_prompt(template_name: str):
    """Return the system-message prompt template registered as ``template_name``.

    Args:
        template_name: Key into ``system_templates``.

    Returns:
        The ``SystemMessagePromptTemplate`` built from the dedented template
        text, with ``template_name`` forwarded as the ``id`` keyword.

    Raises:
        KeyError: If ``template_name`` is not a key of ``system_templates``.
    """
    dedented_text = textwrap.dedent(system_templates[template_name])
    return SystemMessagePromptTemplate.from_template(
        dedented_text, id=template_name
    )


def get_user_prompt():
    """Return the human-message prompt template that poses one brain teaser.

    The template text has two placeholders, ``{question}`` and ``{choices}``.
    The literal is indented to match the function body, so it is dedented
    before being handed to ``HumanMessagePromptTemplate.from_template``.
    """
    indented_text = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """

    return HumanMessagePromptTemplate.from_template(textwrap.dedent(indented_text))


def create_prompt_template(
    system_prompt_template_name: str = "step_by_step",
):
    """Assemble the chat prompt: one system message followed by the user message.

    Args:
        system_prompt_template_name: Name passed to ``get_system_prompt`` to
            select the system message; defaults to ``"step_by_step"``.

    Returns:
        The result of ``ChatPromptTemplate.from_messages`` applied to the
        system and user message templates, in that order.
    """
    # The system message is built first, then the user message.
    message_templates = [
        get_system_prompt(system_prompt_template_name),
        get_user_prompt(),
    ]
    return ChatPromptTemplate.from_messages(message_templates)

In [3]:
from scripts.dataset import BrainteaserDataset

# Dataset handle shared by the cells below (they read `dataset.sp` / `dataset.sp_train`).
# NOTE(review): "data" is presumably a directory resolved against the working directory — confirm.
dataset = BrainteaserDataset("data")

In [4]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Turn a riddle into the keyword arguments expected by the user prompt.

    Args:
        riddle_question: Object exposing ``question`` and ``choice_list``.

    Returns:
        A dict with two keys: ``"question"`` (the riddle's question, unchanged)
        and ``"choices"`` (one line per choice, formatted as ``(A) first``,
        ``(B) second``, ... and joined with newlines).
    """
    question_text = riddle_question.question

    # Label the choices with consecutive uppercase letters, in list order.
    labelled_lines = []
    for position, option in enumerate(riddle_question.choice_list):
        letter = string.ascii_uppercase[position]
        labelled_lines.append(f"({letter}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(labelled_lines),
    }

In [5]:
# from tqdm.notebook import tqdm

# from scripts.lmm import OllamaLlm

# models = [
#     OllamaLlm("llama3.1:8b-instruct-q4_K_M"),  # 4.9 GB
#     OllamaLlm("llama3.1:8b-instruct-fp16"),  # 16 GB
#     OllamaLlm("llama3.1:70b-instruct-q2_K"),  # 26 GB
#     OllamaLlm("llama3.2:3b-instruct-q4_K_M"),  # 2 GB
#     OllamaLlm("llama3.2:3b-instruct-fp16"),  # 6.4 GB
#     OllamaLlm("phi3.5:3.8b-mini-instruct-q4_K_M"),  # 2.4 GB
#     OllamaLlm("phi3.5:3.8b-mini-instruct-fp16"),  # 7.6 GB
#     OllamaLlm("phi4:14b-q4_K_M"),  # 9.1 GB
#     OllamaLlm("phi4:14b-fp16"),  # 29 GB
#     OllamaLlm("deepseek-r1:14b-qwen-distill-fp16"),  # 30 GB
#     OllamaLlm("deepseek-r1:32b-qwen-distill-q4_K_M"),  # 20 GB
#     OllamaLlm("qwen2.5:14b-instruct-fp16"),  # 30 GB
#     OllamaLlm("qwen2.5:32b-instruct-q4_K_M"),  # 20 GB
#     OllamaLlm("gemma2:9b-instruct-fp16"),  # 18 GB
#     OllamaLlm("gemma2:27b-instruct-q4_K_M"),  # 17 GB
#     OllamaLlm("mistral-nemo:12b-instruct-2407-q4_K_M"),  # 17 GB
#     OllamaLlm("mistral-nemo:12b-instruct-2407-fp16"),  # 25 GB
# ]

# sample_riddles = dataset.sp_train[:5]

# prompt_type = "default"
# total_results = []
# print(f"\nProcessing prompt type: {prompt_type}")
# prompt_template = create_prompt_template(prompt_type)
# results = []
# for model in models:
#     model_results = []
#     for riddle in tqdm(sample_riddles, desc=model.name):
#         result = model.generate(prompt_template, args_generator(riddle))
#         model_results.append(result)
#     results.append((model.name, model_results))
# total_results.append((prompt_type, results))

In [None]:
import logging

from scripts.lmm import OllamaLlm
from scripts.executor import Executor

# Keep the HTTP client loggers used underneath Langchain at WARNING so the
# per-request INFO lines do not flood the cell output
# (https://github.com/langchain-ai/langchain/issues/14065#issuecomment-2252540350)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# One OllamaLlm wrapper is created per tag, in the order listed below.
sp_executor = Executor(
    models=[
        OllamaLlm(model_tag)
        for model_tag in (
            # Llama3.1
            "llama3.1:8b-instruct-q4_K_M",
            "llama3.1:8b-instruct-q8_0",
            "llama3.1:8b-instruct-fp16",
            # Llama3.2
            "llama3.2:1b-instruct-q4_K_M",
            "llama3.2:1b-instruct-q8_0",
            "llama3.2:1b-instruct-fp16",
            "llama3.2:3b-instruct-q4_K_M",
            "llama3.2:3b-instruct-q8_0",
            "llama3.2:3b-instruct-fp16",
            # Phi3.5
            "phi3.5:3.8b-mini-instruct-q4_K_M",
            "phi3.5:3.8b-mini-instruct-q8_0",
            "phi3.5:3.8b-mini-instruct-fp16",
            # Phi4
            "phi4:14b-q4_K_M",
            "phi4:14b-q8_0",
            "phi4:14b-fp16",
            # Deepseek R1
            "deepseek-r1:1.5b-qwen-distill-q4_K_M",
            "deepseek-r1:1.5b-qwen-distill-q8_0",
            "deepseek-r1:1.5b-qwen-distill-fp16",
            # NOTE(review): the Ollama library appears to publish the 8B
            # DeepSeek-R1 distill under "8b-llama-distill" and the Qwen one
            # as "7b-qwen-distill" — confirm these three tags can be pulled.
            "deepseek-r1:8b-qwen-distill-q4_K_M",
            "deepseek-r1:8b-qwen-distill-q8_0",
            "deepseek-r1:8b-qwen-distill-fp16",
            "deepseek-r1:14b-qwen-distill-q4_K_M",
            "deepseek-r1:14b-qwen-distill-q8_0",
            "deepseek-r1:14b-qwen-distill-fp16",
            "deepseek-r1:32b-qwen-distill-q4_K_M",
            # Qwen2.5
            "qwen2.5:0.5b-instruct-q4_K_M",
            "qwen2.5:0.5b-instruct-q8_0",
            "qwen2.5:0.5b-instruct-fp16",
            "qwen2.5:1.5b-instruct-q4_K_M",
            "qwen2.5:1.5b-instruct-q8_0",
            "qwen2.5:1.5b-instruct-fp16",
            "qwen2.5:3b-instruct-q4_K_M",
            "qwen2.5:3b-instruct-q8_0",
            "qwen2.5:3b-instruct-fp16",
            "qwen2.5:7b-instruct-q4_K_M",
            "qwen2.5:7b-instruct-q8_0",
            "qwen2.5:7b-instruct-fp16",
            "qwen2.5:14b-instruct-q4_K_M",
            "qwen2.5:14b-instruct-q8_0",
            "qwen2.5:14b-instruct-fp16",
            "qwen2.5:32b-instruct-q4_K_M",
            # Gemma2
            "gemma2:2b-instruct-q4_K_M",
            "gemma2:2b-instruct-q8_0",
            "gemma2:2b-instruct-fp16",
            "gemma2:9b-instruct-q4_K_M",
            "gemma2:9b-instruct-q8_0",
            "gemma2:9b-instruct-fp16",
            "gemma2:27b-instruct-q4_K_M",
            "gemma2:27b-instruct-q8_0",
            # Mistral Nemo
            "mistral-nemo:12b-instruct-2407-q4_K_M",
            "mistral-nemo:12b-instruct-2407-q8_0",
            "mistral-nemo:12b-instruct-2407-fp16",
        )
    ],
    riddle_dataset=dataset.sp,
)

# Baseline run: the "default" system prompt combined with the shared user prompt.
chat_prompt_template = create_prompt_template("default")

# NOTE(review): a checkpoint log saved in this notebook shows a path ending in
# "_sp_results_naive.pkl.pkl", which suggests Executor appends its own ".pkl"
# to file_name — confirm, and if so pass the bare stem instead.
sp_results = sp_executor.execute(
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    file_name="sp_results_naive.pkl",
)

2025-02-13 15:14:11,338 - INFO - Initialized executor with 13 models and 627 riddles
2025-02-13 15:14:11,338 - INFO - Starting execution
2025-02-13 15:14:11,339 - INFO - Processing qwen2.5:14b-instruct-fp16
2025-02-13 15:14:11,339 - INFO - Pulling Ollama model: qwen2.5:14b-instruct-fp16


KeyboardInterrupt: 

In [7]:
import logging

from scripts.lmm import OllamaLlm
from scripts.executor import Executor

# Keep the HTTP client loggers used underneath Langchain at WARNING so the
# per-request INFO lines do not flood the cell output
# (https://github.com/langchain-ai/langchain/issues/14065#issuecomment-2252540350)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# One OllamaLlm wrapper is created per tag, in the order listed below.
# The trailing comments are the author's recorded model sizes.
sp_executor = Executor(
    models=[
        OllamaLlm(model_tag)
        for model_tag in (
            "llama3.1:8b-instruct-q4_K_M",  # 4.9 GB -> 6.43 GB (after loading)
            "llama3.1:8b-instruct-fp16",  # 16 GB
            "llama3.1:70b-instruct-q2_K",  # 26 GB
            "llama3.2:3b-instruct-q4_K_M",  # 2 GB
            "llama3.2:3b-instruct-fp16",  # 6.4 GB
            "phi3.5:3.8b-mini-instruct-q4_K_M",  # 2.4 GB
            "phi3.5:3.8b-mini-instruct-fp16",  # 7.6 GB
            "phi4:14b-q4_K_M",  # 9.1 GB
            "phi4:14b-fp16",  # 29 GB
            "deepseek-r1:14b-qwen-distill-fp16",  # 30 GB
            "deepseek-r1:32b-qwen-distill-q4_K_M",  # 20 GB
            "qwen2.5:14b-instruct-fp16",  # 30 GB
            "qwen2.5:32b-instruct-q4_K_M",  # 20 GB
            "gemma2:9b-instruct-fp16",  # 18 GB
            "gemma2:27b-instruct-q4_K_M",  # 17 GB
            "mistral-nemo:12b-instruct-2407-q4_K_M",  # 17 GB
            "mistral-nemo:12b-instruct-2407-fp16",  # 25 GB
        )
    ],
    riddle_dataset=dataset.sp,
)

# Baseline run: the "default" system prompt combined with the shared user prompt.
chat_prompt_template = create_prompt_template("default")

# NOTE(review): the checkpoint log printed by this cell shows paths ending in
# "_sp_results_naive.pkl.pkl", which suggests Executor appends its own ".pkl"
# to file_name — confirm, and if so pass the bare stem instead.
# NOTE(review): the other executor cell in this notebook uses the same
# file_name, so the two runs presumably write to the same result files.
sp_results = sp_executor.execute(
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    file_name="sp_results_naive.pkl",
)

2025-02-13 12:48:37,726 - INFO - Initialized executor with 17 models and 627 riddles
2025-02-13 12:48:37,727 - INFO - Starting execution
2025-02-13 12:48:37,727 - INFO - Processing llama3.1:8b-instruct-q4_K_M
2025-02-13 12:48:37,727 - INFO - Pulling Ollama model: llama3.1:8b-instruct-q4_K_M


llama3.1:8b-instruct-q4_K_M:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-13 12:51:11,226 - INFO - Dumping results to results/checkpoints/llama3.1:8b-instruct-q4_K_M_sp_results_naive.pkl.pkl
2025-02-13 12:51:11,353 - INFO - Cleaning up llama3.1:8b-instruct-q4_K_M
2025-02-13 12:51:11,354 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-13 12:51:11,354 - INFO - Processing llama3.1:8b-instruct-fp16
2025-02-13 12:51:11,354 - INFO - Pulling Ollama model: llama3.1:8b-instruct-fp16


llama3.1:8b-instruct-fp16:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-13 12:53:48,048 - INFO - Dumping results to results/checkpoints/llama3.1:8b-instruct-fp16_sp_results_naive.pkl.pkl
2025-02-13 12:53:48,279 - INFO - Cleaning up llama3.1:8b-instruct-fp16
2025-02-13 12:53:48,279 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-13 12:53:48,280 - INFO - Processing llama3.1:70b-instruct-q2_K
2025-02-13 12:53:48,280 - INFO - Pulling Ollama model: llama3.1:70b-instruct-q2_K


llama3.1:70b-instruct-q2_K:   0%|          | 0/627 [00:00<?, ?it/s]

2025-02-13 13:16:15,935 - INFO - Cleaning up llama3.1:70b-instruct-q2_K
2025-02-13 13:16:15,936 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!


KeyboardInterrupt: 