In [8]:
from scripts import setup_environment

# Project-specific setup helper imported from `scripts`.
# NOTE(review): presumably configures logging/paths for the notebook — confirm
# in scripts.setup_environment.
setup_environment()

In [9]:
from scripts.dataset import BrainteaserDataset

# Build the brainteaser dataset, passing "data" as the constructor argument
# (NOTE(review): presumably the data directory — confirm). Later cells read the
# `dataset.sp` and `dataset.wp` splits from this object.
dataset = BrainteaserDataset("data")

In [10]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    """Build the prompt-template arguments for a single riddle.

    Args:
        riddle_question: Riddle providing the ``question`` text and the
            ``choice_list`` of candidate answers.

    Returns:
        A dict with two entries: ``"question"`` (the question text, unchanged)
        and ``"choices"`` (one ``(A) ...``-style line per choice, lettered in
        list order and joined with newlines).
    """
    question_text = riddle_question.question

    # Label the choices (A), (B), (C), ... following their order in the list.
    lettered_choices = []
    for position, option in enumerate(riddle_question.choice_list):
        letter = string.ascii_uppercase[position]
        lettered_choices.append(f"({letter}) {option}")

    return {
        "question": question_text,
        "choices": "\n".join(lettered_choices),
    }


# Prompt template created from the "default" prompt type.
# NOTE(review): the cells visible below never read this global — the executor run
# is given `get_prompt_template`, which builds its own per-model template.
# Confirm whether this is still needed.
chat_prompt_template = create_prompt_template("default")

In [11]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

# Base URL handed to the Ollama model builder.
# NOTE(review): the endpoint is hard-coded; consider moving it to configuration.
base_url = "http://142.214.185.26:40001"
model_builder = OllamaModelBuilder(base_url)

# Build one model per tag, in the listed order. The trailing comments carry the
# size notes recorded for each tag.
executor = Executor(
    models=[
        model_builder.build_model(model_tag)
        for model_tag in (
            # Llama3.1
            "llama3.1:8b-instruct-q8_0",  # => 9 GB
            # Llama3.2
            "llama3.2:1b-instruct-fp16",  # => 2.5 GB
            "llama3.2:3b-instruct-fp16",  # => 6.4 GB
            # Phi3.5
            "phi3.5:3.8b-mini-instruct-fp16",  # => 7.6 GB
            # Phi4
            "phi4:14b-q8_0",  # => 16 GB
            # Qwen2.5
            "qwen2.5:0.5b-instruct-fp16",  # => 1 GB
            "qwen2.5:1.5b-instruct-fp16",  # => 3.1 GB
            "qwen2.5:3b-instruct-fp16",  # => 6.2 GB
            "qwen2.5:7b-instruct-q8_0",  # => 8.1 GB
            "qwen2.5:14b-instruct-q8_0",  # => 16 GB
            "qwen2.5:32b-instruct-q4_K_M",  # => 20 GB
            # Gemma2
            "gemma2:2b-instruct-fp16",  # => 5.2 GB
            "gemma2:9b-instruct-q8_0",  # => 9.8 GB
            "gemma2:27b-instruct-q4_K_M",  # => 22 GB
            # Mistral Nemo
            "mistral-nemo:12b-instruct-2407-q8_0",  # => 13 GB
        )
    ]
)

2025-03-14 17:00:53,526 - INFO - Initialized executor with 15 models.


In [12]:
import numpy as np

from scripts.executor import Dataset

# Seed NumPy's global random state so the np.random.choice sampling done in
# create_test_dataset is reproducible
np.random.seed(42)


def create_test_dataset(data: list[RiddleQuestion], name: str, percentage: float = 0.1):
    """Draw a random subset of ``data`` and wrap it in a ``Dataset``.

    Args:
        data: Riddles to sample from.
        name: Name given to the resulting ``Dataset``.
        percentage: Fraction of ``data`` to keep; the sample size is
            ``int(len(data) * percentage)``.

    Returns:
        A ``Dataset`` named ``name`` holding the sampled riddles, drawn without
        replacement through NumPy's global random state.
    """
    population_size = len(data)
    sample_size = int(population_size * percentage)

    # Sample positions rather than items so each riddle is picked at most once.
    picked_positions = np.random.choice(
        population_size, size=sample_size, replace=False
    )
    sampled_riddles = [data[position] for position in picked_positions]

    return Dataset(name=name, riddles=sampled_riddles)


# Sample each split in turn: "sp" first, then "wp". The order matters because
# both calls draw from the same seeded global NumPy random state.
sp_data = create_test_dataset(data=dataset.sp, name="sp")
wp_data = create_test_dataset(data=dataset.wp, name="wp")

# Datasets passed to the executor run
executor_data = [sp_data, wp_data]

In [13]:
import dill as pickle

# Get the best prompt type for each model.
# The loaded mapping is indexed as
# best_prompt_types[model_name][dataset_name]["prompt_type"]
# (see create_prompt_template_by_model below).
# NOTE(review): `pickle` here is `dill`; unpickling can execute arbitrary code,
# so this file must come from a trusted source.
with open("results/best_system_prompts_by_model.pkl", "rb") as f:
    best_prompt_types = pickle.load(f)


def create_prompt_template_by_model(
    model_name: str,
    dataset_name: str,
):
    """Create the prompt template recorded as best for a model on a dataset.

    Args:
        model_name: Key of the model in ``best_prompt_types``.
        dataset_name: Key of the dataset within that model's entry.

    Returns:
        Whatever ``create_prompt_template`` returns for the stored
        ``"prompt_type"`` value.

    Raises:
        KeyError: If the model, the dataset, or the ``"prompt_type"`` entry is
            missing from ``best_prompt_types``.
    """
    per_dataset_choices = best_prompt_types[model_name]
    chosen_entry = per_dataset_choices[dataset_name]
    return create_prompt_template(chosen_entry["prompt_type"])


def get_prompt_template(model_name: str, dataset: Dataset):
    """Return the best prompt template for a full model tag and a dataset.

    The model tag is shortened to everything up to and including the
    parameter-count ``b`` (e.g. ``llama3.1:8b-instruct-fp16`` => ``llama3.1:8b``)
    before the lookup. A tag that contains no ``"b-"`` (for instance an
    already-short ``llama3.1:8b``) is used unchanged instead of raising
    ``ValueError``.

    Args:
        model_name: Full model tag, e.g. ``"qwen2.5:7b-instruct-q8_0"``.
        dataset: Dataset whose ``name`` selects the per-dataset entry.

    Returns:
        The template produced by ``create_prompt_template_by_model`` for the
        shortened model name and ``dataset.name``.
    """
    # Split at the first "b-": the head plus the "b" is the short model name.
    head, separator, _suffix = model_name.partition("b-")
    short_model_name = head + "b" if separator else model_name
    return create_prompt_template_by_model(short_model_name, dataset.name)

In [14]:
# Repeat the whole benchmark five times and keep each repetition's outputs.
# NOTE(review): the top-level `await` relies on the notebook's async support, and
# `results`, `time_per_model` and `run_index` stay defined after the loop with
# the values from the last repetition.
runs = []
for run_index in range(5):
    # `get_prompt_template` selects a template per (model, dataset);
    # `args_generator` supplies the question and lettered choices per riddle.
    results, time_per_model = await executor.aexecute(
        executor_data,
        get_prompt_template,
        args_generator,
        dump_to_pickle=True,
        create_checkpoints=True,
        # NOTE(review): presumably skips work already checkpointed — confirm in Executor
        resume_from_checkpoint=True,
        run_name="model_speed",
        # Distinct suffix per repetition
        file_name_suffix=f"run_{run_index}",
    )
    # One (results, time_per_model) pair per repetition
    runs.append((results, time_per_model))

2025-03-14 17:00:53,576 - INFO - Starting execution 'model-speed with suffix 'run_0'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


model-speed(run-0):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-14 17:37:19,199 - INFO - Saving results to results/model-speed/model-speed_run-0_results.pkl
2025-03-14 17:37:19,696 - INFO - Execution 'model-speed with suffix 'run_0'' completed successfully.
2025-03-14 17:37:19,697 - INFO - Starting execution 'model-speed with suffix 'run_1'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


model-speed(run-1):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-14 18:00:47,564 - INFO - Saving results to results/model-speed/model-speed_run-1_results.pkl
2025-03-14 18:00:48,069 - INFO - Execution 'model-speed with suffix 'run_1'' completed successfully.
2025-03-14 18:00:48,070 - INFO - Starting execution 'model-speed with suffix 'run_2'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


model-speed(run-2):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-14 18:25:06,510 - INFO - Saving results to results/model-speed/model-speed_run-2_results.pkl
2025-03-14 18:25:06,980 - INFO - Execution 'model-speed with suffix 'run_2'' completed successfully.
2025-03-14 18:25:06,981 - INFO - Starting execution 'model-speed with suffix 'run_3'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


model-speed(run-3):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-14 18:47:15,357 - INFO - Saving results to results/model-speed/model-speed_run-3_results.pkl
2025-03-14 18:47:15,847 - INFO - Execution 'model-speed with suffix 'run_3'' completed successfully.
2025-03-14 18:47:15,847 - INFO - Starting execution 'model-speed with suffix 'run_4'': 2 dataset(s) x 15 model(s) = 1665 riddle evaluations


model-speed(run-4):   0%|          | 0/1665 [00:00<?, ?it/s]

2025-03-14 19:11:45,852 - INFO - Saving results to results/model-speed/model-speed_run-4_results.pkl
2025-03-14 19:11:46,344 - INFO - Execution 'model-speed with suffix 'run_4'' completed successfully.


## Eval results
