In [1]:
from scripts import setup_environment

# Run the project's environment setup helper before any other cell.
# NOTE(review): the helper is defined in the local `scripts` package; what it
# configures is not visible from this notebook.
setup_environment()

In [2]:
import textwrap

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System message templates (priming).
# Maps a technique name to the system-message text sent ahead of the question.
# Long texts are split across adjacent string literals purely for readability;
# the resulting values are single-line strings.
system_templates = {
    "default": "You are an AI assistant.",
    "default_improved": (
        "You are an AI assistant specialized in solving lateral thinking questions. "
        "You will receive a question with multiple choices and must determine the "
        "correct answer using logical reasoning and problem-solving techniques."
    ),
    "step_by_step": (
        "You are a logical problem solver. "
        "Break down the question systematically, analyze each answer choice step by step, "
        "eliminate incorrect options, and select the best answer."
    ),
    "creative": (
        "You are a lateral thinker. "
        "Approach each question with flexible reasoning, exploring unconventional yet "
        "valid interpretations before selecting the best answer."
    ),
    "elimination": (
        "You are a strategic reasoner. "
        "First, identify and eliminate incorrect answer choices. "
        "Then, select the most logical remaining option."
    ),
    "metaphor": (
        "You are skilled in abstract reasoning. "
        "Consider both literal and metaphorical meanings in the question and choices "
        "before selecting the most insightful answer."
    ),
    "confidence": (
        "You are an analytical decision-maker. "
        "Assess the likelihood of correctness for each choice, score them internally, "
        "and select the answer with the highest confidence."
    ),
    "perspective_shift": (
        "You are a multi-perspective analyst. "
        "Evaluate the question from different angles, considering alternative "
        "interpretations before determining the best answer."
    ),
    "common_sense": (
        "You balance logic and practicality. "
        "Apply both structured reasoning and real-world common sense to determine "
        "the most reasonable answer."
    ),
    "assumption_challenge": (
        "You are a critical thinker. "
        "Identify and question hidden assumptions in the question and choices before "
        "selecting the answer that best challenges or aligns with them."
    ),
    "pattern_matching": (
        "You recognize patterns and relationships. "
        "Identify logical structures, recurring themes, or hidden connections in the "
        "question and choices before selecting the best answer."
    ),
    "intuitive": (
        "You combine intuition with logic. "
        "Generate an initial answer instinctively, then critically evaluate it for "
        "logical soundness before finalizing your choice."
    ),
}


def get_system_prompt(template_name: str):
    """Build the system (priming) message template registered under ``template_name``.

    Args:
        template_name: Key into ``system_templates``.

    Returns:
        The result of ``SystemMessagePromptTemplate.from_template`` for the
        stored text, with ``template_name`` passed along as ``id``.

    Raises:
        KeyError: If ``template_name`` is not a key of ``system_templates``.
            The message lists the registered names.
    """
    try:
        system_prompt = system_templates[template_name]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # tells the caller which names are actually registered.
        available = ", ".join(system_templates)
        raise KeyError(
            f"Unknown system template {template_name!r}; available: {available}"
        ) from None

    system_prompt = textwrap.dedent(system_prompt)

    return SystemMessagePromptTemplate.from_template(system_prompt, id=template_name)


def get_user_prompt():
    """Return the human-message template that poses one multiple-choice brain teaser.

    The template text carries two placeholders, ``{question}`` and ``{choices}``.
    It is written indented here for readability and dedented before the
    template object is built.
    """
    indented_text = """
    Please pick the best choice for the brain teaser. Each brain teaser has only one possible solution including the choice none of above, answer should only provide the choice:

    Question: {question}
    Choice:
    {choices}
    Answer:
    """
    return HumanMessagePromptTemplate.from_template(textwrap.dedent(indented_text))


def create_prompt_template(
    system_prompt_template_name: str = "default",
):
    """Assemble the two-message chat prompt: system priming first, then the question.

    Args:
        system_prompt_template_name: Name of the system template to use;
            forwarded to ``get_system_prompt``.

    Returns:
        The result of ``ChatPromptTemplate.from_messages`` for the
        ``[system, user]`` message templates.
    """
    message_templates = [
        get_system_prompt(system_prompt_template_name),
        get_user_prompt(),
    ]
    return ChatPromptTemplate.from_messages(message_templates)

In [3]:
from scripts.dataset import BrainteaserDataset

# Dataset handle used by the later cells, which read its `sp` and `wp` attributes.
# NOTE(review): "data" is presumably the dataset directory — confirm in scripts/dataset.
dataset = BrainteaserDataset("data")

In [4]:
import string

from scripts.dataset import RiddleQuestion


def args_generator(riddle_question: RiddleQuestion):
    """Map one riddle onto the keyword arguments expected by the user prompt.

    Args:
        riddle_question: Object exposing ``question`` and ``choice_list``.

    Returns:
        A dict with ``"question"`` (passed through unchanged) and ``"choices"``:
        one line per entry of ``choice_list``, in order, each prefixed with its
        uppercase letter label, e.g. ``"(A) first\\n(B) second"``.
    """
    question_text = riddle_question.question

    labelled_choices = []
    for position, choice in enumerate(riddle_question.choice_list):
        letter = string.ascii_uppercase[position]
        labelled_choices.append(f"({letter}) {choice}")

    return {
        "question": question_text,
        "choices": "\n".join(labelled_choices),
    }


# Prompt built with the "default" system message.
# NOTE(review): the experiment loop in a later cell rebinds this same name once
# per technique, so this initial value is not what the runs end up using.
chat_prompt_template = create_prompt_template("default")

In [5]:
from scripts.lmm import OllamaModel
from scripts.executor import Executor

import os

# Ollama endpoint shared by every model below. Set the OLLAMA_BASE_URL
# environment variable to point the notebook at a different server; when it is
# unset or empty, the host this experiment was originally run against is used.
base_url = os.environ.get("OLLAMA_BASE_URL") or "http://136.61.33.107:41302"

# Model tags to evaluate, grouped by family. The executor receives the models
# in exactly this order.
model_tags = [
    # Llama3.1
    "llama3.1:8b",
    # Llama3.2
    "llama3.2:1b",
    "llama3.2:3b",
    # Phi3.5
    "phi3.5:3.8b",
    # Phi4
    "phi4:14b",
    # Qwen2.5
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "qwen2.5:3b",
    "qwen2.5:7b",
    "qwen2.5:14b",
    "qwen2.5:32b",
    # Gemma2
    "gemma2:2b",
    "gemma2:9b",
    "gemma2:27b",
    # Mistral Nemo
    "mistral-nemo:12b",
]

executor = Executor(
    models=[OllamaModel(tag, base_url=base_url) for tag in model_tags]
)

2025-02-17 20:20:18,821 - INFO - Initialized executor with 15 models.


In [6]:
# Number of leading items taken from each dataset split for this run.
test_sample_size = 50
# First `test_sample_size` entries of the `sp` and `wp` collections.
# NOTE(review): presumably the sentence-puzzle / word-puzzle splits — confirm in scripts/dataset.
sp_data = dataset.sp[0:test_sample_size]
wp_data = dataset.wp[0:test_sample_size]

In [7]:
from pathlib import Path

import dill as pickle

total_results = {}

reduced_dataset = dataset.sp[0:]
for technique in system_templates:
    chat_prompt_template = create_prompt_template(technique)
    sp_results = await executor.aexecute(
        sp_data,
        chat_prompt_template,
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        result_file_name=f"test_0/sp_results_{technique}",
    )

    wp_results = await executor.aexecute(
        wp_data,
        chat_prompt_template,
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        resume_from_checkpoint=True,
        result_file_name=f"test_0/wp_results_{technique}",
    )

    total_results[technique] = {"sp": sp_results, "wp": wp_results}

results_file = Path("test_0/results_test_0_system-prime-messages.pkl")
results_file.parent.mkdir(parents=True, exist_ok=True)

with results_file.open("wb") as f:
    pickle.dump(total_results, f)

2025-02-17 20:20:18,835 - INFO - Loading results from result file: results/test_0/sp_results_default.pkl
2025-02-17 20:20:18,930 - INFO - Results file is valid, returning results
2025-02-17 20:20:18,930 - INFO - Loading results from result file: results/test_0/wp_results_default.pkl
2025-02-17 20:20:18,938 - INFO - Results file is valid, returning results
2025-02-17 20:20:18,938 - INFO - Loading results from result file: results/test_0/sp_results_default_improved.pkl
2025-02-17 20:20:18,946 - INFO - Results file is valid, returning results
2025-02-17 20:20:18,946 - INFO - Loading results from result file: results/test_0/wp_results_default_improved.pkl
2025-02-17 20:20:18,955 - INFO - Results file is valid, returning results
2025-02-17 20:20:18,956 - INFO - Loading results from result file: results/test_0/sp_results_step_by_step.pkl
2025-02-17 20:20:19,049 - INFO - Results file is valid, returning results
2025-02-17 20:20:19,050 - INFO - Loading results from result file: results/test_0/

llama3.1:8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:20:41,991 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/llama3.1:8b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:20:42,002 - INFO - Cleaning up llama3.1:8b
2025-02-17 20:20:42,003 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:20:42,003 - INFO - Processing llama3.2:1b
2025-02-17 20:20:42,003 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:20:56,874 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/llama3.2:1b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:20:56,886 - INFO - Cleaning up llama3.2:1b
2025-02-17 20:20:56,886 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:20:56,887 - INFO - Processing llama3.2:3b
2025-02-17 20:20:56,887 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:21:10,414 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/llama3.2:3b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:21:10,461 - INFO - Cleaning up llama3.2:3b
2025-02-17 20:21:10,462 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:21:10,462 - INFO - Processing phi3.5:3.8b
2025-02-17 20:21:10,463 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:22:57,163 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/phi3.5:3.8b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:22:57,175 - INFO - Cleaning up phi3.5:3.8b
2025-02-17 20:22:57,176 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:22:57,176 - INFO - Processing phi4:14b
2025-02-17 20:22:57,176 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:24:14,807 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/phi4:14b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:24:14,820 - INFO - Cleaning up phi4:14b
2025-02-17 20:24:14,820 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:24:14,821 - INFO - Processing qwen2.5:0.5b
2025-02-17 20:24:14,821 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:24:59,220 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/qwen2.5:0.5b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:24:59,232 - INFO - Cleaning up qwen2.5:0.5b
2025-02-17 20:24:59,233 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:24:59,233 - INFO - Processing qwen2.5:1.5b
2025-02-17 20:24:59,233 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:25:19,975 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/qwen2.5:1.5b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:25:19,986 - INFO - Cleaning up qwen2.5:1.5b
2025-02-17 20:25:19,987 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:25:19,987 - INFO - Processing qwen2.5:3b
2025-02-17 20:25:19,988 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:25:39,938 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/qwen2.5:3b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:25:39,950 - INFO - Cleaning up qwen2.5:3b
2025-02-17 20:25:39,950 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:25:39,951 - INFO - Processing qwen2.5:7b
2025-02-17 20:25:39,951 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:26:04,687 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/qwen2.5:7b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:26:04,698 - INFO - Cleaning up qwen2.5:7b
2025-02-17 20:26:04,698 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:26:04,699 - INFO - Processing qwen2.5:14b
2025-02-17 20:26:04,699 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:26:48,008 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/qwen2.5:14b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:26:48,019 - INFO - Cleaning up qwen2.5:14b
2025-02-17 20:26:48,019 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:26:48,019 - INFO - Processing qwen2.5:32b
2025-02-17 20:26:48,020 - INFO - Pulling Ollama model: qwen2.5:32b


qwen2.5:32b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:28:12,063 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/qwen2.5:32b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:28:12,074 - INFO - Cleaning up qwen2.5:32b
2025-02-17 20:28:12,074 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:28:12,074 - INFO - Processing gemma2:2b
2025-02-17 20:28:12,075 - INFO - Pulling Ollama model: gemma2:2b


gemma2:2b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:28:30,499 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/gemma2:2b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:28:30,510 - INFO - Cleaning up gemma2:2b
2025-02-17 20:28:30,511 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:28:30,511 - INFO - Processing gemma2:9b
2025-02-17 20:28:30,512 - INFO - Pulling Ollama model: gemma2:9b


gemma2:9b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:29:01,907 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/gemma2:9b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:29:01,921 - INFO - Cleaning up gemma2:9b
2025-02-17 20:29:01,921 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:29:01,922 - INFO - Processing gemma2:27b
2025-02-17 20:29:01,922 - INFO - Pulling Ollama model: gemma2:27b
2025-02-17 20:29:24,875 - ERROR - Error pulling Ollama model: write /root/.ollama/models/blobs/sha256-d7e4b00a7d7a8d03d4eed9b0f3f61a427e9f0fc5dea6aeb414e41dee23dc8ecc-partial: no space left on device (status code: 500)
2025-02-17 20:29:24,875 - INFO - Deleting all ollama models to free up space


gemma2:27b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:30:33,525 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/gemma2:27b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:30:33,537 - INFO - Cleaning up gemma2:27b
2025-02-17 20:30:33,538 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:30:33,538 - INFO - Processing mistral-nemo:12b
2025-02-17 20:30:33,539 - INFO - Pulling Ollama model: mistral-nemo:12b


mistral-nemo:12b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:31:05,700 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_pattern_matching/mistral-nemo:12b_test_0/sp_results_pattern_matching.pkl
2025-02-17 20:31:05,713 - INFO - Cleaning up mistral-nemo:12b
2025-02-17 20:31:05,713 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:31:05,713 - INFO - Asynchronous execution complete
2025-02-17 20:31:05,713 - INFO - Dumping results to results/test_0/sp_results_pattern_matching.pkl
2025-02-17 20:31:05,849 - INFO - Starting asynchronous execution
2025-02-17 20:31:05,850 - INFO - Split dataset of 50 items into 10 batches of size 5
2025-02-17 20:31:05,850 - INFO - Processing llama3.1:8b
2025-02-17 20:31:05,850 - INFO - Pulling Ollama model: llama3.1:8b


llama3.1:8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:31:30,991 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/llama3.1:8b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:31:31,002 - INFO - Cleaning up llama3.1:8b
2025-02-17 20:31:31,003 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:31:31,003 - INFO - Processing llama3.2:1b
2025-02-17 20:31:31,004 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:31:47,527 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/llama3.2:1b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:31:47,540 - INFO - Cleaning up llama3.2:1b
2025-02-17 20:31:47,541 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:31:47,541 - INFO - Processing llama3.2:3b
2025-02-17 20:31:47,542 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:32:01,377 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/llama3.2:3b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:32:01,388 - INFO - Cleaning up llama3.2:3b
2025-02-17 20:32:01,388 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:32:01,389 - INFO - Processing phi3.5:3.8b
2025-02-17 20:32:01,389 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:34:12,749 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/phi3.5:3.8b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:34:12,762 - INFO - Cleaning up phi3.5:3.8b
2025-02-17 20:34:12,763 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:34:12,763 - INFO - Processing phi4:14b
2025-02-17 20:34:12,763 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:35:39,363 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/phi4:14b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:35:39,375 - INFO - Cleaning up phi4:14b
2025-02-17 20:35:39,375 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:35:39,375 - INFO - Processing qwen2.5:0.5b
2025-02-17 20:35:39,375 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:36:10,212 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/qwen2.5:0.5b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:36:10,224 - INFO - Cleaning up qwen2.5:0.5b
2025-02-17 20:36:10,224 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:36:10,225 - INFO - Processing qwen2.5:1.5b
2025-02-17 20:36:10,226 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:36:26,927 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/qwen2.5:1.5b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:36:26,939 - INFO - Cleaning up qwen2.5:1.5b
2025-02-17 20:36:26,940 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:36:26,940 - INFO - Processing qwen2.5:3b
2025-02-17 20:36:26,940 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:36:43,910 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/qwen2.5:3b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:36:43,925 - INFO - Cleaning up qwen2.5:3b
2025-02-17 20:36:43,926 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:36:43,926 - INFO - Processing qwen2.5:7b
2025-02-17 20:36:43,927 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:37:08,787 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/qwen2.5:7b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:37:08,798 - INFO - Cleaning up qwen2.5:7b
2025-02-17 20:37:08,798 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:37:08,798 - INFO - Processing qwen2.5:14b
2025-02-17 20:37:08,798 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:38:00,228 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/qwen2.5:14b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:38:00,239 - INFO - Cleaning up qwen2.5:14b
2025-02-17 20:38:00,239 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:38:00,240 - INFO - Processing qwen2.5:32b
2025-02-17 20:38:00,240 - INFO - Pulling Ollama model: qwen2.5:32b


qwen2.5:32b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:39:41,077 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/qwen2.5:32b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:39:41,090 - INFO - Cleaning up qwen2.5:32b
2025-02-17 20:39:41,091 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:39:41,091 - INFO - Processing gemma2:2b
2025-02-17 20:39:41,091 - INFO - Pulling Ollama model: gemma2:2b


gemma2:2b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:39:57,108 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/gemma2:2b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:39:57,122 - INFO - Cleaning up gemma2:2b
2025-02-17 20:39:57,122 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:39:57,123 - INFO - Processing gemma2:9b
2025-02-17 20:39:57,123 - INFO - Pulling Ollama model: gemma2:9b
2025-02-17 20:40:08,532 - ERROR - Error pulling Ollama model: write /root/.ollama/models/blobs/sha256-ff1d1fc78170d787ee1201778e2dd65ea211654ca5fb7d69b5a2e7b123a50373-partial: no space left on device (status code: 500)
2025-02-17 20:40:08,532 - INFO - Deleting all ollama models to free up space


gemma2:9b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:40:54,603 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/gemma2:9b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:40:54,614 - INFO - Cleaning up gemma2:9b
2025-02-17 20:40:54,615 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:40:54,615 - INFO - Processing gemma2:27b
2025-02-17 20:40:54,616 - INFO - Pulling Ollama model: gemma2:27b


gemma2:27b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:42:10,849 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/gemma2:27b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:42:10,861 - INFO - Cleaning up gemma2:27b
2025-02-17 20:42:10,861 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:42:10,861 - INFO - Processing mistral-nemo:12b
2025-02-17 20:42:10,862 - INFO - Pulling Ollama model: mistral-nemo:12b


mistral-nemo:12b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:42:49,480 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_pattern_matching/mistral-nemo:12b_test_0/wp_results_pattern_matching.pkl
2025-02-17 20:42:49,490 - INFO - Cleaning up mistral-nemo:12b
2025-02-17 20:42:49,491 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:42:49,491 - INFO - Asynchronous execution complete
2025-02-17 20:42:49,491 - INFO - Dumping results to results/test_0/wp_results_pattern_matching.pkl
2025-02-17 20:42:49,633 - INFO - Starting asynchronous execution
2025-02-17 20:42:49,633 - INFO - Split dataset of 50 items into 10 batches of size 5
2025-02-17 20:42:49,634 - INFO - Processing llama3.1:8b
2025-02-17 20:42:49,634 - INFO - Pulling Ollama model: llama3.1:8b


llama3.1:8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:43:18,386 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/llama3.1:8b_test_0/sp_results_intuitive.pkl
2025-02-17 20:43:18,398 - INFO - Cleaning up llama3.1:8b
2025-02-17 20:43:18,399 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:43:18,399 - INFO - Processing llama3.2:1b
2025-02-17 20:43:18,399 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:43:34,129 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/llama3.2:1b_test_0/sp_results_intuitive.pkl
2025-02-17 20:43:34,140 - INFO - Cleaning up llama3.2:1b
2025-02-17 20:43:34,140 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:43:34,141 - INFO - Processing llama3.2:3b
2025-02-17 20:43:34,141 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:43:48,984 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/llama3.2:3b_test_0/sp_results_intuitive.pkl
2025-02-17 20:43:48,995 - INFO - Cleaning up llama3.2:3b
2025-02-17 20:43:48,996 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:43:48,996 - INFO - Processing phi3.5:3.8b
2025-02-17 20:43:48,996 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:46:08,608 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/phi3.5:3.8b_test_0/sp_results_intuitive.pkl
2025-02-17 20:46:08,621 - INFO - Cleaning up phi3.5:3.8b
2025-02-17 20:46:08,622 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:46:08,622 - INFO - Processing phi4:14b
2025-02-17 20:46:08,622 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:48:10,156 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/phi4:14b_test_0/sp_results_intuitive.pkl
2025-02-17 20:48:10,167 - INFO - Cleaning up phi4:14b
2025-02-17 20:48:10,167 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:48:10,168 - INFO - Processing qwen2.5:0.5b
2025-02-17 20:48:10,168 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:48:58,379 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/qwen2.5:0.5b_test_0/sp_results_intuitive.pkl
2025-02-17 20:48:58,391 - INFO - Cleaning up qwen2.5:0.5b
2025-02-17 20:48:58,391 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:48:58,392 - INFO - Processing qwen2.5:1.5b
2025-02-17 20:48:58,392 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:49:29,928 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/qwen2.5:1.5b_test_0/sp_results_intuitive.pkl
2025-02-17 20:49:29,939 - INFO - Cleaning up qwen2.5:1.5b
2025-02-17 20:49:29,939 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:49:29,939 - INFO - Processing qwen2.5:3b
2025-02-17 20:49:29,940 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:50:32,313 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/qwen2.5:3b_test_0/sp_results_intuitive.pkl
2025-02-17 20:50:32,329 - INFO - Cleaning up qwen2.5:3b
2025-02-17 20:50:32,330 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:50:32,330 - INFO - Processing qwen2.5:7b
2025-02-17 20:50:32,330 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:51:26,931 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/qwen2.5:7b_test_0/sp_results_intuitive.pkl
2025-02-17 20:51:26,942 - INFO - Cleaning up qwen2.5:7b
2025-02-17 20:51:26,942 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:51:26,942 - INFO - Processing qwen2.5:14b
2025-02-17 20:51:26,943 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:52:57,096 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/qwen2.5:14b_test_0/sp_results_intuitive.pkl
2025-02-17 20:52:57,109 - INFO - Cleaning up qwen2.5:14b
2025-02-17 20:52:57,109 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:52:57,110 - INFO - Processing qwen2.5:32b
2025-02-17 20:52:57,110 - INFO - Pulling Ollama model: qwen2.5:32b


qwen2.5:32b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:54:38,346 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/qwen2.5:32b_test_0/sp_results_intuitive.pkl
2025-02-17 20:54:38,357 - INFO - Cleaning up qwen2.5:32b
2025-02-17 20:54:38,357 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:54:38,357 - INFO - Processing gemma2:2b
2025-02-17 20:54:38,358 - INFO - Pulling Ollama model: gemma2:2b
2025-02-17 20:54:41,158 - ERROR - Error pulling Ollama model: write /root/.ollama/models/blobs/sha256-7462734796d67c40ecec2ca98eddf970e171dbb6b370e43fd633ee75b69abe1b-partial: no space left on device (status code: 500)
2025-02-17 20:54:41,158 - INFO - Deleting all ollama models to free up space


gemma2:2b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:55:29,042 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/gemma2:2b_test_0/sp_results_intuitive.pkl
2025-02-17 20:55:29,057 - INFO - Cleaning up gemma2:2b
2025-02-17 20:55:29,058 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:55:29,058 - INFO - Processing gemma2:9b
2025-02-17 20:55:29,058 - INFO - Pulling Ollama model: gemma2:9b


gemma2:9b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:56:29,133 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/gemma2:9b_test_0/sp_results_intuitive.pkl
2025-02-17 20:56:29,145 - INFO - Cleaning up gemma2:9b
2025-02-17 20:56:29,145 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:56:29,146 - INFO - Processing gemma2:27b
2025-02-17 20:56:29,146 - INFO - Pulling Ollama model: gemma2:27b


gemma2:27b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:58:00,019 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/gemma2:27b_test_0/sp_results_intuitive.pkl
2025-02-17 20:58:00,030 - INFO - Cleaning up gemma2:27b
2025-02-17 20:58:00,030 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:58:00,031 - INFO - Processing mistral-nemo:12b
2025-02-17 20:58:00,031 - INFO - Pulling Ollama model: mistral-nemo:12b


mistral-nemo:12b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:58:38,983 - INFO - Creating checkpoint: results/checkpoints/test_0/sp_results_intuitive/mistral-nemo:12b_test_0/sp_results_intuitive.pkl
2025-02-17 20:58:38,995 - INFO - Cleaning up mistral-nemo:12b
2025-02-17 20:58:38,995 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:58:38,996 - INFO - Asynchronous execution complete
2025-02-17 20:58:38,996 - INFO - Dumping results to results/test_0/sp_results_intuitive.pkl
2025-02-17 20:58:39,138 - INFO - Starting asynchronous execution
2025-02-17 20:58:39,139 - INFO - Split dataset of 50 items into 10 batches of size 5
2025-02-17 20:58:39,139 - INFO - Processing llama3.1:8b
2025-02-17 20:58:39,139 - INFO - Pulling Ollama model: llama3.1:8b


llama3.1:8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:59:03,947 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/llama3.1:8b_test_0/wp_results_intuitive.pkl
2025-02-17 20:59:03,963 - INFO - Cleaning up llama3.1:8b
2025-02-17 20:59:03,964 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:59:03,964 - INFO - Processing llama3.2:1b
2025-02-17 20:59:03,965 - INFO - Pulling Ollama model: llama3.2:1b


llama3.2:1b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:59:19,422 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/llama3.2:1b_test_0/wp_results_intuitive.pkl
2025-02-17 20:59:19,432 - INFO - Cleaning up llama3.2:1b
2025-02-17 20:59:19,433 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:59:19,433 - INFO - Processing llama3.2:3b
2025-02-17 20:59:19,433 - INFO - Pulling Ollama model: llama3.2:3b


llama3.2:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 20:59:33,603 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/llama3.2:3b_test_0/wp_results_intuitive.pkl
2025-02-17 20:59:33,614 - INFO - Cleaning up llama3.2:3b
2025-02-17 20:59:33,614 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 20:59:33,615 - INFO - Processing phi3.5:3.8b
2025-02-17 20:59:33,615 - INFO - Pulling Ollama model: phi3.5:3.8b


phi3.5:3.8b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:01:48,753 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/phi3.5:3.8b_test_0/wp_results_intuitive.pkl
2025-02-17 21:01:48,766 - INFO - Cleaning up phi3.5:3.8b
2025-02-17 21:01:48,766 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:01:48,767 - INFO - Processing phi4:14b
2025-02-17 21:01:48,767 - INFO - Pulling Ollama model: phi4:14b


phi4:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:04:13,687 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/phi4:14b_test_0/wp_results_intuitive.pkl
2025-02-17 21:04:13,698 - INFO - Cleaning up phi4:14b
2025-02-17 21:04:13,699 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:04:13,699 - INFO - Processing qwen2.5:0.5b
2025-02-17 21:04:13,700 - INFO - Pulling Ollama model: qwen2.5:0.5b


qwen2.5:0.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:04:55,485 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/qwen2.5:0.5b_test_0/wp_results_intuitive.pkl
2025-02-17 21:04:55,498 - INFO - Cleaning up qwen2.5:0.5b
2025-02-17 21:04:55,498 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:04:55,498 - INFO - Processing qwen2.5:1.5b
2025-02-17 21:04:55,499 - INFO - Pulling Ollama model: qwen2.5:1.5b


qwen2.5:1.5b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:05:19,338 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/qwen2.5:1.5b_test_0/wp_results_intuitive.pkl
2025-02-17 21:05:19,349 - INFO - Cleaning up qwen2.5:1.5b
2025-02-17 21:05:19,349 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:05:19,350 - INFO - Processing qwen2.5:3b
2025-02-17 21:05:19,350 - INFO - Pulling Ollama model: qwen2.5:3b


qwen2.5:3b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:06:15,667 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/qwen2.5:3b_test_0/wp_results_intuitive.pkl
2025-02-17 21:06:15,681 - INFO - Cleaning up qwen2.5:3b
2025-02-17 21:06:15,682 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:06:15,682 - INFO - Processing qwen2.5:7b
2025-02-17 21:06:15,682 - INFO - Pulling Ollama model: qwen2.5:7b


qwen2.5:7b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:07:27,084 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/qwen2.5:7b_test_0/wp_results_intuitive.pkl
2025-02-17 21:07:27,096 - INFO - Cleaning up qwen2.5:7b
2025-02-17 21:07:27,096 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:07:27,096 - INFO - Processing qwen2.5:14b
2025-02-17 21:07:27,097 - INFO - Pulling Ollama model: qwen2.5:14b


qwen2.5:14b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:09:23,258 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/qwen2.5:14b_test_0/wp_results_intuitive.pkl
2025-02-17 21:09:23,271 - INFO - Cleaning up qwen2.5:14b
2025-02-17 21:09:23,272 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:09:23,272 - INFO - Processing qwen2.5:32b
2025-02-17 21:09:23,272 - INFO - Pulling Ollama model: qwen2.5:32b
2025-02-17 21:10:05,406 - ERROR - Error pulling Ollama model: write /root/.ollama/models/blobs/sha256-eabc98a9bcbfce7fd70f3e07de599f8fda98120fefed5881934161ede8bd1a41-partial: no space left on device (status code: 500)
2025-02-17 21:10:05,407 - INFO - Deleting all ollama models to free up space


qwen2.5:32b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:11:53,177 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/qwen2.5:32b_test_0/wp_results_intuitive.pkl
2025-02-17 21:11:53,188 - INFO - Cleaning up qwen2.5:32b
2025-02-17 21:11:53,189 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:11:53,189 - INFO - Processing gemma2:2b
2025-02-17 21:11:53,189 - INFO - Pulling Ollama model: gemma2:2b


gemma2:2b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:12:39,041 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/gemma2:2b_test_0/wp_results_intuitive.pkl
2025-02-17 21:12:39,051 - INFO - Cleaning up gemma2:2b
2025-02-17 21:12:39,052 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:12:39,052 - INFO - Processing gemma2:9b
2025-02-17 21:12:39,052 - INFO - Pulling Ollama model: gemma2:9b


gemma2:9b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:13:32,613 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/gemma2:9b_test_0/wp_results_intuitive.pkl
2025-02-17 21:13:32,625 - INFO - Cleaning up gemma2:9b
2025-02-17 21:13:32,625 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:13:32,626 - INFO - Processing gemma2:27b
2025-02-17 21:13:32,626 - INFO - Pulling Ollama model: gemma2:27b


gemma2:27b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:15:26,140 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/gemma2:27b_test_0/wp_results_intuitive.pkl
2025-02-17 21:15:26,151 - INFO - Cleaning up gemma2:27b
2025-02-17 21:15:26,151 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:15:26,152 - INFO - Processing mistral-nemo:12b
2025-02-17 21:15:26,152 - INFO - Pulling Ollama model: mistral-nemo:12b


mistral-nemo:12b:   0%|          | 0/50 [00:00<?, ?it/s]

2025-02-17 21:16:09,847 - INFO - Creating checkpoint: results/checkpoints/test_0/wp_results_intuitive/mistral-nemo:12b_test_0/wp_results_intuitive.pkl
2025-02-17 21:16:09,860 - INFO - Cleaning up mistral-nemo:12b
2025-02-17 21:16:09,860 - INFO - Ollama models will be deleted on demand and therefore this step is skipped!
2025-02-17 21:16:09,861 - INFO - Asynchronous execution complete
2025-02-17 21:16:09,861 - INFO - Dumping results to results/test_0/wp_results_intuitive.pkl
