In [1]:
from scripts import setup_environment

# Run the project's setup helper (imported from `scripts`) in the first cell of the notebook.
# NOTE(review): what exactly it configures is not visible from this notebook; confirm in `scripts`.
setup_environment()

In [2]:
import textwrap

from langchain.prompts import PromptTemplate

from scripts.dataset import RiddleQuestion


def create_prompt(
    model_name: str, riddle_question: RiddleQuestion
) -> list[dict[str, str]]:
    """Build the chat messages asking a model to answer one multiple-choice riddle.

    The user message contains the question, the choices numbered
    ``Option 1`` .. ``Option N`` in the order of ``riddle_question.choice_list``,
    and a closing instruction about the expected answer format.

    Args:
        model_name: Identifier of the model the prompt is built for. If it
            contains ``"gemma"`` (compared case-insensitively), the system
            instructions are folded into the user message instead of being
            sent as a separate system message.
        riddle_question: The riddle to render. Its ``question`` and
            ``choice_list`` attributes are read.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: a single ``user``
        message for Gemma models, otherwise a ``system`` message followed by
        a ``user`` message.
    """
    prompt_template = """
    Question: {question}
    Choices:
    {choices}
    Answer: {answer}
    """

    # The template is indented between the triple quotes, so strip that
    # indentation; otherwise it would be preserved in the rendered prompt.
    prompt_template = textwrap.dedent(prompt_template)

    prompt = PromptTemplate(
        input_variables=["question", "choices", "answer"], template=prompt_template
    )
    template_args = {
        "question": riddle_question.question,
        # Choices are numbered starting at 1, one per line.
        "choices": "\n".join(
            f"Option {j + 1}: {choice}"
            for j, choice in enumerate(riddle_question.choice_list)
        ),
        "answer": "Please return the answer in the format of the choice.",
    }

    # Fix: the original literal ended with a stray double quote and carried
    # trailing spaces on two lines, all of which were sent to the model.
    system_prompt = """
    You are a helpful AI assistant.
    You will be given a question with multiple choice answers.
    Your task is to select the correct answer from the given choices.
    """
    system_prompt = textwrap.dedent(system_prompt)
    user_prompt = prompt.format(**template_args)

    # Gemma models require alternating user and assistant roles and do not
    # use a system role, so the instructions are prepended to the user turn.
    # The check is case-insensitive so differently capitalised ids still match.
    if "gemma" in model_name.lower():
        return [
            {
                "role": "user",
                "content": f"{system_prompt}\n{user_prompt}",
            },
        ]

    return [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": user_prompt,
        },
    ]

In [3]:
from scripts.dataset import BrainteaserDataset
from scripts.executor import ModelExecutor

# Model identifiers handed to the executor.
model_names = [
    "google/gemma-2-2b-it",
    "google/gemma-2-9b-it",
    "microsoft/Phi-3.5-mini-instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
]

# Load the dataset from the "data" directory and use its `sp_train` split.
dataset = BrainteaserDataset("data")
executor = ModelExecutor(models=model_names, dataset=dataset.sp_train)

# Start the execution run, with `create_prompt` as the prompt builder.
results = executor.run(create_prompt)

2025-01-26 20:35:04,043 - INFO - Initialized ModelExecutor with 9 models and 507 riddle questions
2025-01-26 20:35:04,044 - INFO - Starting model execution run
2025-01-26 20:35:04,044 - INFO - Processing model: google/gemma-2-2b-it
2025-01-26 20:35:04,044 - INFO - Creating pipeline for model: google/gemma-2-2b-it


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
2025-01-26 20:35:06,771 - INFO - Pipeline created successfully
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
2025-01-26 20:35:16,411 - INFO - google/gemma-2-2b-it execution took 9 seconds
2025-01-26 20:35:26,026 - INFO - google/gemma-2-2b-it execution took 9 seconds
2025-01-26 20:35:33,889 - INFO - google/gemma-2-2b-it execution took 7 seconds
2025-01-26 20:35:57,354 - INFO - google/gemma-2-2b-it execution took 23 seconds
2025-01-26 20:36:20,867 - INFO - google/gemma-2-2b-it execution took 23 seconds
2025-01-26 20:36:28,219 - INFO - google/gemma-2-2b-it execution took 7 seconds
2025-01-26 20:36:35,475 - INFO - google/gemma-2-2b-it execution took 7 seconds
2025-01-26 20:36:44,457 - INFO - google/gemma-2-2b-it execution took 8 seconds
2025-01-26 20:36:50,978 - INFO - google/gemma-2-2b-it execution took 6 seconds
2025-01-26 20:36:56,437 - INFO - google/gemma-

KeyboardInterrupt: 

In [None]:
# Print the plain-text repr of `results`, the value returned by `executor.run(create_prompt)`.
print(results)

In [4]:
def _print_result(result):
    """Print one result's prompt messages and model response, then a divider line."""
    print(f"Prompt: {result.prompt_messages()}")
    print(f"Output: {result.model_response()}")
    print("-" * 50)


# Walk the results per model and echo every prompt/response pair.
for model, model_results in results.items():
    print(f"Model: {model}")
    for result in model_results:
        _print_result(result)

Model: meta-llama/Llama-3.2-1B-Instruct
Prompt: [{'role': 'system', 'content': 'You are a helpful AI assistant. You will be given a question with multiple choice answers. Your task is to select the correct answer from the given choices.'}, {'role': 'user', 'content': '\nQuestion: Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?\nChoices:\nOption 1: Some daughters get married and have their own family.\nOption 2: Each daughter shares the same brother.\nOption 3: Some brothers were not loved by family and moved away.\nOption 4: None of above.\nAnswer: Please return the answer in the format of the choice.\n'}]
Output: Option 2: Each daughter shares the same brother.
--------------------------------------------------
Prompt: [{'role': 'system', 'content': 'You are a helpful AI assistant. You will be given a question with multiple choice answers. Your task is to select the correct answer from the give