In [1]:
from scripts import setup_environment

# Run the project-local bootstrap before anything else in the notebook.
# NOTE(review): presumably this configures logging and paths for the cells below —
# confirm against the `scripts` package.
setup_environment()

In [2]:
import textwrap

from langchain.prompts.chat import (
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming).
# Maps a template name to the text of the system message that primes the model.
# Entries are looked up by name in `get_system_prompt_template`; "default" is the
# name `get_few_shot_chat_template` uses when no template is requested.
# NOTE(review): the values are handed to `SystemMessagePromptTemplate.from_template`,
# so a literal `{` or `}` added here would presumably be parsed as a placeholder —
# confirm before adding such text.
system_templates = {
    "default": "You are a helpful assistant.",
    "step_by_step": "You are a meticulous problem-solver.",
    "creative": "You excel at lateral thinking. Treat this as a riddle.",
    "elimination": "Eliminate wrong options internally.",
    "metaphor": "Interpret keywords metaphorically.",
    "confidence": "Score options internally.",
    "perspective_shift": "Analyze through multiple perspectives silently.",
    "common_sense": "Combine logic and creativity.",
    "assumption_challenge": "Challenge hidden assumptions internally.",
    "pattern_matching": "Find patterns silently.",
    "intuitive": "Critique your intuition internally.",
}


def get_system_prompt_template(template_name: str):
    """Build the system message prompt template registered under ``template_name``.

    Args:
        template_name: Key into the module-level ``system_templates`` mapping.

    Returns:
        The ``SystemMessagePromptTemplate`` created from the dedented template
        text, with ``template_name`` passed along as the ``id`` keyword.

    Raises:
        KeyError: If ``template_name`` is not a key of ``system_templates``.
    """
    dedented_text = textwrap.dedent(system_templates[template_name])
    return SystemMessagePromptTemplate.from_template(dedented_text, id=template_name)


def get_humand_prompt_template():
    """Build the human (user) message template that poses one brain teaser.

    The template has two placeholders, ``{question}`` and ``{choices}``. The
    function name keeps its existing spelling because other cells call it
    under this name.

    Returns:
        The ``HumanMessagePromptTemplate`` created from the dedented prompt text.
    """
    # The literal is indented to match the code; dedent strips that indentation.
    raw_prompt = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """
    return HumanMessagePromptTemplate.from_template(textwrap.dedent(raw_prompt))

In [3]:
from scripts.dataset import BrainteaserDataset

# Build the project dataset object from "data"
# (presumably a directory relative to the notebook — confirm).
# Later cells read its `sp` and `wp` attributes as the riddle collections to evaluate.
dataset = BrainteaserDataset("data")

In [4]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Turn one riddle into the keyword arguments used to fill the prompt templates.

    Args:
        riddle_question: Object providing ``question``, ``choice_list`` and
            ``label`` attributes.

    Returns:
        A dict with three keys:
        ``"question"`` is the riddle's question unchanged;
        ``"choices"`` is one line per choice in the form ``(A) text``,
        ``(B) text``, ... joined with newlines;
        ``"answer"`` is the uppercase letter at index ``label``, i.e. the
        letter assigned to the choice at that position.
    """
    letters = string.ascii_uppercase
    question_text = riddle_question.question

    # Label every choice with consecutive uppercase letters, starting at "A".
    labelled_choices = []
    for position, choice_text in enumerate(riddle_question.choice_list):
        labelled_choices.append(f"({letters[position]}) {choice_text}")

    return {
        "question": question_text,
        "choices": "\n".join(labelled_choices),
        "answer": letters[riddle_question.label],
    }

In [5]:
# We now create the few-shot examples, following the best practices from https://python.langchain.com/docs/how_to/few_shot_examples_chat/
# Thus we do not put the examples into the initial prompt text; instead we provide them as a message history in which the user asks and the AI answers.

from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate


def get_few_shot_chat_template(
    dataset: list[RiddleQuestion],
    number_of_shots: int = 4,
    system_template: str = "default",
):
    """Build a few-shot chat prompt and split off the riddles left for evaluation.

    The first ``number_of_shots`` riddles become worked examples. Each example
    is rendered as a human message (the riddle) followed by an AI message (the
    answer letter). The final prompt is: system message, the example
    exchanges, then the human message template for the riddle to solve.

    Args:
        dataset: Riddles in the order they should be consumed. Must support
            slicing.
        number_of_shots: How many leading riddles to use as examples. ``0``
            yields a prompt without examples.
        system_template: Name of the system message template, passed to
            ``get_system_prompt_template``.

    Returns:
        A tuple ``(chat_prompt_template, riddles_to_solve)``, where
        ``riddles_to_solve`` is ``dataset`` without the leading example
        riddles. It is empty when ``number_of_shots`` is at least
        ``len(dataset)``.

    Raises:
        ValueError: If ``number_of_shots`` is negative.
    """
    # A negative count would not fail on its own: slicing would silently turn
    # almost the whole dataset into examples and evaluate only its tail.
    if number_of_shots < 0:
        raise ValueError(
            f"number_of_shots must be non-negative, got {number_of_shots}"
        )

    riddles_as_examples = dataset[:number_of_shots]
    riddles_to_solve = dataset[number_of_shots:]

    # One example is a human/AI exchange: the riddle, then its answer letter.
    example_prompt = ChatPromptTemplate.from_messages(
        [
            get_humand_prompt_template(),
            ("ai", "{answer}"),
        ]
    )
    few_shot_prompt = FewShotChatMessagePromptTemplate(
        example_prompt=example_prompt,
        examples=[args_generator(example) for example in riddles_as_examples],
    )

    chat_prompt_template = ChatPromptTemplate.from_messages(
        [
            get_system_prompt_template(system_template),
            few_shot_prompt,
            get_humand_prompt_template(),
        ]
    )
    return (chat_prompt_template, riddles_to_solve)

In [6]:
# NOTE: disabled experiment, kept for reference — it would pick the few-shot examples
# by semantic similarity instead of taking the first n riddles of the dataset.
# from langchain_chroma import Chroma
# from langchain_ollama import OllamaEmbeddings
# from langchain_core.example_selectors import SemanticSimilarityExampleSelector

# riddles = dataset.sp
# examples_full = [args_generator(riddle) for riddle in riddles]
# example_selector = SemanticSimilarityExampleSelector.from_examples(
#     examples_full,
#     OllamaEmbeddings(
#         model="bge-m3"
#     ),  # bge-m3 excels at handling context-rich queries due to its higher embedding dimensions; as a fallback we could also use models like nomic-embed-text for short semantic queries
#     Chroma,
#     k=4,
# )


In [7]:
from scripts.lmm import OllamaModel
from scripts.executor import Executor

# One OllamaModel per tag, grouped by model family; the list order is preserved.
executor = Executor(
    models=[
        OllamaModel(model_tag)
        for model_tag in (
            # Llama3.1
            "llama3.1:8b",
            # Llama3.2
            "llama3.2:1b",
            "llama3.2:3b",
            # Phi3.5
            "phi3.5:3.8b",
            # Phi4
            "phi4:14b",
            # Qwen2.5
            "qwen2.5:0.5b",
            "qwen2.5:1.5b",
            "qwen2.5:3b",
            "qwen2.5:7b",
            "qwen2.5:14b",
            "qwen2.5:32b",
            # Gemma2
            "gemma2:2b",
            "gemma2:9b",
            "gemma2:27b",
            # Mistral Nemo
            "mistral-nemo:12b",
        )
    ]
)

2025-02-15 17:27:19,791 - INFO - Initialized executor with 15 models.


In [8]:
# Number of leading riddles used as few-shot examples; also part of the result file name.
n_shots = 4
# The first `n_shots` riddles of `dataset.sp` become the examples in the prompt;
# the remaining riddles are the ones evaluated.
chat_prompt_template, riddles_for_eval = get_few_shot_chat_template(dataset.sp, n_shots)

# Run every model configured on `executor` over the remaining riddles.
# Top-level `await` only works inside a notebook/IPython kernel.
# Per the captured log, this run took about 35 minutes, wrote one checkpoint per
# model under results/checkpoints/<result_file_name>/ and dumped the combined
# results to results/<result_file_name>.pkl.
sp_results = await executor.aexecute(
    riddles_for_eval,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    result_file_name=f"sp_results_few_shot_n_{n_shots}",
)

2025-02-15 17:27:19,805 - INFO - Starting asynchronous execution
2025-02-15 17:27:19,805 - INFO - Split dataset of 623 items into 125 batches of size 5
2025-02-15 17:27:19,806 - INFO - Processing llama3.1:8b
2025-02-15 17:27:19,806 - INFO - Pulling Ollama model: llama3.1:8b


llama3.1:8b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:28:24,091 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/llama3.1:8b_sp_results_few_shot_n_4.pkl
2025-02-15 17:28:24,549 - INFO - Cleaning up llama3.1:8b
2025-02-15 17:28:24,550 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:28:24,551 - INFO - Processing llama3.2:1b
2025-02-15 17:28:24,551 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:29:03,619 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/llama3.2:1b_sp_results_few_shot_n_4.pkl
2025-02-15 17:29:04,043 - INFO - Cleaning up llama3.2:1b
2025-02-15 17:29:04,044 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:29:04,044 - INFO - Processing llama3.2:3b
2025-02-15 17:29:04,044 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:29:46,495 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/llama3.2:3b_sp_results_few_shot_n_4.pkl
2025-02-15 17:29:46,901 - INFO - Cleaning up llama3.2:3b
2025-02-15 17:29:46,901 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:29:46,902 - INFO - Processing phi3.5:3.8b
2025-02-15 17:29:46,902 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:37:25,461 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/phi3.5:3.8b_sp_results_few_shot_n_4.pkl
2025-02-15 17:37:25,923 - INFO - Cleaning up phi3.5:3.8b
2025-02-15 17:37:25,924 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:37:25,924 - INFO - Processing phi4:14b
2025-02-15 17:37:25,925 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:39:24,300 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/phi4:14b_sp_results_few_shot_n_4.pkl
2025-02-15 17:39:24,798 - INFO - Cleaning up phi4:14b
2025-02-15 17:39:24,799 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:39:24,799 - INFO - Processing qwen2.5:0.5b
2025-02-15 17:39:24,800 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:41:21,925 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/qwen2.5:0.5b_sp_results_few_shot_n_4.pkl
2025-02-15 17:41:22,442 - INFO - Cleaning up qwen2.5:0.5b
2025-02-15 17:41:22,443 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:41:22,443 - INFO - Processing qwen2.5:1.5b
2025-02-15 17:41:22,443 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:43:21,561 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/qwen2.5:1.5b_sp_results_few_shot_n_4.pkl
2025-02-15 17:43:22,063 - INFO - Cleaning up qwen2.5:1.5b
2025-02-15 17:43:22,064 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:43:22,064 - INFO - Processing qwen2.5:3b
2025-02-15 17:43:22,064 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:45:25,099 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/qwen2.5:3b_sp_results_few_shot_n_4.pkl
2025-02-15 17:45:25,632 - INFO - Cleaning up qwen2.5:3b
2025-02-15 17:45:25,633 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:45:25,633 - INFO - Processing qwen2.5:7b
2025-02-15 17:45:25,633 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:47:50,684 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/qwen2.5:7b_sp_results_few_shot_n_4.pkl
2025-02-15 17:47:51,010 - INFO - Cleaning up qwen2.5:7b
2025-02-15 17:47:51,010 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:47:51,010 - INFO - Processing qwen2.5:14b
2025-02-15 17:47:51,010 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:50:36,282 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/qwen2.5:14b_sp_results_few_shot_n_4.pkl
2025-02-15 17:50:36,582 - INFO - Cleaning up qwen2.5:14b
2025-02-15 17:50:36,583 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:50:36,583 - INFO - Processing qwen2.5:32b
2025-02-15 17:50:36,583 - INFO - Pulling Ollama model: qwen2.5:32b


qwen2.5:32b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:54:27,828 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/qwen2.5:32b_sp_results_few_shot_n_4.pkl
2025-02-15 17:54:28,330 - INFO - Cleaning up qwen2.5:32b
2025-02-15 17:54:28,331 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:54:28,331 - INFO - Processing gemma2:2b
2025-02-15 17:54:28,331 - INFO - Pulling Ollama model: gemma2:2b


gemma2:2b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:55:26,938 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/gemma2:2b_sp_results_few_shot_n_4.pkl
2025-02-15 17:55:27,461 - INFO - Cleaning up gemma2:2b
2025-02-15 17:55:27,461 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:55:27,461 - INFO - Processing gemma2:9b
2025-02-15 17:55:27,462 - INFO - Pulling Ollama model: gemma2:9b


gemma2:9b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 17:57:59,104 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/gemma2:9b_sp_results_few_shot_n_4.pkl
2025-02-15 17:57:59,435 - INFO - Cleaning up gemma2:9b
2025-02-15 17:57:59,435 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 17:57:59,435 - INFO - Processing gemma2:27b
2025-02-15 17:57:59,435 - INFO - Pulling Ollama model: gemma2:27b


gemma2:27b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 18:01:00,459 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/gemma2:27b_sp_results_few_shot_n_4.pkl
2025-02-15 18:01:00,751 - INFO - Cleaning up gemma2:27b
2025-02-15 18:01:00,751 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:01:00,752 - INFO - Processing mistral-nemo:12b
2025-02-15 18:01:00,752 - INFO - Pulling Ollama model: mistral-nemo:12b


mistral-nemo:12b:   0%|          | 0/623 [00:00<?, ?it/s]

2025-02-15 18:02:49,594 - INFO - Creating checkpoint: results/checkpoints/sp_results_few_shot_n_4/mistral-nemo:12b_sp_results_few_shot_n_4.pkl
2025-02-15 18:02:50,218 - INFO - Cleaning up mistral-nemo:12b
2025-02-15 18:02:50,219 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:02:50,219 - INFO - Asynchronous execution complete
2025-02-15 18:02:50,219 - INFO - Dumping results to results/sp_results_few_shot_n_4.pkl


In [9]:
# Same procedure as the SP run, applied to `dataset.wp`.
# Depends on `n_shots` from the previous cell and rebinds `chat_prompt_template`
# and `riddles_for_eval`, so after this cell both names refer to the WP data.
chat_prompt_template, riddles_for_eval = get_few_shot_chat_template(dataset.wp, n_shots)

# Run every model configured on `executor` over the remaining WP riddles.
# Top-level `await` only works inside a notebook/IPython kernel.
# Per the captured log, this run took about 22 minutes, wrote one checkpoint per
# model under results/checkpoints/<result_file_name>/ and dumped the combined
# results to results/<result_file_name>.pkl.
wp_results = await executor.aexecute(
    riddles_for_eval,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    result_file_name=f"wp_results_few_shot_n_{n_shots}",
)

2025-02-15 18:02:56,198 - INFO - Starting asynchronous execution
2025-02-15 18:02:56,198 - INFO - Split dataset of 488 items into 98 batches of size 5
2025-02-15 18:02:56,199 - INFO - Processing llama3.1:8b
2025-02-15 18:02:56,199 - INFO - Pulling Ollama model: llama3.1:8b


llama3.1:8b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:03:25,118 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/llama3.1:8b_wp_results_few_shot_n_4.pkl
2025-02-15 18:03:25,335 - INFO - Cleaning up llama3.1:8b
2025-02-15 18:03:25,336 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:03:25,336 - INFO - Processing llama3.2:1b
2025-02-15 18:03:25,336 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:03:45,042 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/llama3.2:1b_wp_results_few_shot_n_4.pkl
2025-02-15 18:03:45,270 - INFO - Cleaning up llama3.2:1b
2025-02-15 18:03:45,270 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:03:45,270 - INFO - Processing llama3.2:3b
2025-02-15 18:03:45,271 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:04:09,508 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/llama3.2:3b_wp_results_few_shot_n_4.pkl
2025-02-15 18:04:09,728 - INFO - Cleaning up llama3.2:3b
2025-02-15 18:04:09,729 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:04:09,729 - INFO - Processing phi3.5:3.8b
2025-02-15 18:04:09,729 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:11:26,072 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/phi3.5:3.8b_wp_results_few_shot_n_4.pkl
2025-02-15 18:11:26,302 - INFO - Cleaning up phi3.5:3.8b
2025-02-15 18:11:26,303 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:11:26,303 - INFO - Processing phi4:14b
2025-02-15 18:11:26,303 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:12:07,615 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/phi4:14b_wp_results_few_shot_n_4.pkl
2025-02-15 18:12:07,847 - INFO - Cleaning up phi4:14b
2025-02-15 18:12:07,847 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:12:07,848 - INFO - Processing qwen2.5:0.5b
2025-02-15 18:12:07,848 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:13:27,959 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/qwen2.5:0.5b_wp_results_few_shot_n_4.pkl
2025-02-15 18:13:28,177 - INFO - Cleaning up qwen2.5:0.5b
2025-02-15 18:13:28,178 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:13:28,178 - INFO - Processing qwen2.5:1.5b
2025-02-15 18:13:28,179 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:14:50,045 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/qwen2.5:1.5b_wp_results_few_shot_n_4.pkl
2025-02-15 18:14:50,670 - INFO - Cleaning up qwen2.5:1.5b
2025-02-15 18:14:50,670 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:14:50,670 - INFO - Processing qwen2.5:3b
2025-02-15 18:14:50,671 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:16:13,479 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/qwen2.5:3b_wp_results_few_shot_n_4.pkl
2025-02-15 18:16:13,712 - INFO - Cleaning up qwen2.5:3b
2025-02-15 18:16:13,713 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:16:13,713 - INFO - Processing qwen2.5:7b
2025-02-15 18:16:13,713 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:17:38,519 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/qwen2.5:7b_wp_results_few_shot_n_4.pkl
2025-02-15 18:17:38,749 - INFO - Cleaning up qwen2.5:7b
2025-02-15 18:17:38,750 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:17:38,750 - INFO - Processing qwen2.5:14b
2025-02-15 18:17:38,750 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:19:11,796 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/qwen2.5:14b_wp_results_few_shot_n_4.pkl
2025-02-15 18:19:12,038 - INFO - Cleaning up qwen2.5:14b
2025-02-15 18:19:12,038 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:19:12,038 - INFO - Processing qwen2.5:32b
2025-02-15 18:19:12,039 - INFO - Pulling Ollama model: qwen2.5:32b


qwen2.5:32b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:21:16,603 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/qwen2.5:32b_wp_results_few_shot_n_4.pkl
2025-02-15 18:21:16,830 - INFO - Cleaning up qwen2.5:32b
2025-02-15 18:21:16,831 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:21:16,831 - INFO - Processing gemma2:2b
2025-02-15 18:21:16,831 - INFO - Pulling Ollama model: gemma2:2b


gemma2:2b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:21:47,155 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/gemma2:2b_wp_results_few_shot_n_4.pkl
2025-02-15 18:21:47,880 - INFO - Cleaning up gemma2:2b
2025-02-15 18:21:47,881 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:21:47,881 - INFO - Processing gemma2:9b
2025-02-15 18:21:47,881 - INFO - Pulling Ollama model: gemma2:9b


gemma2:9b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:22:59,335 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/gemma2:9b_wp_results_few_shot_n_4.pkl
2025-02-15 18:22:59,557 - INFO - Cleaning up gemma2:9b
2025-02-15 18:22:59,558 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:22:59,558 - INFO - Processing gemma2:27b
2025-02-15 18:22:59,558 - INFO - Pulling Ollama model: gemma2:27b


gemma2:27b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:24:11,364 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/gemma2:27b_wp_results_few_shot_n_4.pkl
2025-02-15 18:24:11,591 - INFO - Cleaning up gemma2:27b
2025-02-15 18:24:11,591 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:24:11,592 - INFO - Processing mistral-nemo:12b
2025-02-15 18:24:11,592 - INFO - Pulling Ollama model: mistral-nemo:12b


mistral-nemo:12b:   0%|          | 0/488 [00:00<?, ?it/s]

2025-02-15 18:25:02,481 - INFO - Creating checkpoint: results/checkpoints/wp_results_few_shot_n_4/mistral-nemo:12b_wp_results_few_shot_n_4.pkl
2025-02-15 18:25:02,712 - INFO - Cleaning up mistral-nemo:12b
2025-02-15 18:25:02,713 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-15 18:25:02,713 - INFO - Asynchronous execution complete
2025-02-15 18:25:02,713 - INFO - Dumping results to results/wp_results_few_shot_n_4.pkl
