In [9]:
from scripts import setup_environment

# Run the project's environment setup before the remaining cells.
# NOTE(review): what this configures lives in the `scripts` package and is not
# visible in this notebook — presumably logging and paths; confirm there.
setup_environment()

In [10]:
from scripts.dataset import BrainteaserDataset

# Build the riddle dataset object; later cells read its `sp` and `wp` attributes.
# "data" is presumably the directory holding the dataset files — TODO confirm.
dataset = BrainteaserDataset("data")

In [11]:
import string

from scripts.dataset import RiddleQuestion
from scripts.prompt_helpers import create_prompt_template


def args_generator(riddle_question: RiddleQuestion):
    """
    Build the prompt-template arguments for a single riddle.

    Args:
        riddle_question: Riddle providing `question` and `choice_list`.

    Returns:
        A dict with two keys: "question" (the riddle's question) and "choices"
        (one line per answer option, each prefixed with a letter label such as
        "(A)", "(B)", ... in list order).
    """
    question_text = riddle_question.question

    option_lines = []
    for position, option_text in enumerate(riddle_question.choice_list):
        # Labels come from the uppercase ASCII alphabet, indexed by position.
        letter = string.ascii_uppercase[position]
        option_lines.append(f"({letter}) {option_text}")

    return {
        "question": question_text,
        "choices": "\n".join(option_lines),
    }

In [12]:
from scripts.lmm import OllamaModelBuilder
from scripts.executor import Executor

import os

# The Ollama endpoint can be overridden through the OLLAMA_BASE_URL environment
# variable so the notebook does not need editing when the server address
# changes; an unset or empty variable falls back to the original host.
base_url = os.environ.get("OLLAMA_BASE_URL") or "http://50.173.30.254:40106"
model_builder = OllamaModelBuilder(base_url)

# Model tags to evaluate, grouped by family. The trailing comments carry the
# size the author noted for each model.
model_tags = [
    # Llama3.1
    "llama3.1:8b-instruct-q8_0",  # => 9 GB
    # Llama3.2
    "llama3.2:1b-instruct-fp16",  # => 2.5 GB
    "llama3.2:3b-instruct-fp16",  # => 6.4 GB
    # Phi3.5
    "phi3.5:3.8b-mini-instruct-fp16",  # => 7.6 GB
    # Phi4
    "phi4:14b-q8_0",  # => 16 GB
    # Qwen2.5
    "qwen2.5:0.5b-instruct-fp16",  # => 1 GB
    "qwen2.5:1.5b-instruct-fp16",  # => 3.1 GB
    "qwen2.5:3b-instruct-fp16",  # => 6.2 GB
    "qwen2.5:7b-instruct-q8_0",  # => 8.1 GB
    "qwen2.5:14b-instruct-q8_0",  # => 16 GB
    "qwen2.5:32b-instruct-q4_K_M",  # => 20 GB
    # Gemma2
    "gemma2:2b-instruct-fp16",  # => 5.2 GB
    "gemma2:9b-instruct-q8_0",  # => 9.8 GB
    "gemma2:27b-instruct-q4_K_M",  # => 22 GB
    # Mistral Nemo
    "mistral-nemo:12b-instruct-2407-q8_0",  # => 13 GB
]

# One model object per tag, in the order listed above.
executor = Executor(models=[model_builder.build_model(tag) for tag in model_tags])

2025-03-11 20:01:29,231 - INFO - Initialized executor with 15 models.


## Baseline Zero-Shot Evaluation

This run measures performance with the minimal default system prompt (`You are an AI assistant.`) and no task-specific instructions.


In [13]:
from scripts.executor import Dataset

executor_data = [
    Dataset(name="sp", riddles=dataset.sp),
    Dataset(name="wp", riddles=dataset.wp),
]
chat_prompt_template = create_prompt_template("default")
wrapped_results_baseline = await executor.aexecute(
    executor_data,
    chat_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    run_name="baseline-zero-shot-evaluation",
)

2025-03-11 20:01:29,288 - INFO - Starting execution 'baseline-zero-shot-evaluation': 2 dataset(s) x 15 model(s) = 16785 riddle evaluations


baseline-zero-shot-evaluation:   0%|          | 0/16785 [00:00<?, ?it/s]

## Model-Specific System Prompt Optimization: Comparative Analysis of Zero-Shot Performance


In [14]:
import dill as pickle

# Load the stored mapping of the best prompt type for each model. It is indexed
# below as best_prompt_types[model_name][dataset_name]["prompt_type"].
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this project.
best_prompts_path = "results/best_system_prompts_by_model.pkl"
with open(best_prompts_path, "rb") as best_prompts_file:
    best_prompt_types = pickle.load(best_prompts_file)


def create_prompt_template_by_model(
    model_name: str,
    dataset_name: str,
):
    """
    Create the prompt template recorded as best for a model/dataset pair.

    Args:
        model_name: Key into the module-level `best_prompt_types` mapping.
        dataset_name: Key into that model's per-dataset entries.

    Returns:
        The result of `create_prompt_template` for the stored "prompt_type".

    Raises:
        KeyError: If the model, the dataset or the "prompt_type" entry is missing.
    """
    per_dataset_entries = best_prompt_types[model_name]
    best_entry = per_dataset_entries[dataset_name]
    return create_prompt_template(best_entry["prompt_type"])


def get_prompt_template(model_name: str, dataset: "Dataset"):
    """
    Return the prompt template recorded as best for a model on a dataset.

    The model tag is shortened to everything up to and including the first "b"
    that is directly followed by "-", e.g. llama3.1:8b-instruct-fp16 =>
    llama3.1:8b. The shortened name and `dataset.name` are then passed to
    `create_prompt_template_by_model`.

    Args:
        model_name: Model tag, with or without a suffix after the parameter count.
        dataset: Object whose `name` attribute identifies the dataset.

    Returns:
        Whatever `create_prompt_template_by_model` returns for the pair.
    """
    # str.find returns -1 instead of raising when the tag contains no "b-"
    # (for example an already-shortened "llama3.1:8b"); such names are used
    # unchanged rather than failing with ValueError.
    suffix_start = model_name.find("b-")
    if suffix_start == -1:
        base_model_name = model_name
    else:
        base_model_name = model_name[: suffix_start + 1]
    return create_prompt_template_by_model(base_model_name, dataset.name)

In [15]:
wrapped_results_optimized = await executor.aexecute(
    executor_data,
    get_prompt_template,
    args_generator,
    dump_to_pickle=True,
    create_checkpoints=True,
    resume_from_checkpoint=True,
    run_name="system_optimized_zero_shot_evaluation",
)

2025-03-11 20:01:30,741 - INFO - Starting execution 'system-optimized-zero-shot-evaluation': 2 dataset(s) x 15 model(s) = 16785 riddle evaluations


system-optimized-zero-shot-evaluation:   0%|          | 0/16785 [00:00<?, ?it/s]

## Diagrams


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scripts.executor import WrappedResults
from scripts.evaluation import (
    calculate_model_accuracy,
)


def wrapped_results_to_pd_frame(wrapped_results: WrappedResults) -> pd.DataFrame:
    """
    Flatten wrapped evaluation results into a DataFrame for easier plotting.

    Args:
        wrapped_results: Results whose `results` attribute maps dataset name to
            a mapping of model name to that model's results.

    Returns:
        A DataFrame with one row per (dataset, model) pair and the columns
        "Dataset", "Model", "Accuracy(raw)" and "Accuracy(postprocessed)".
    """

    def _accuracy_row(dataset_name, model_name, model_results):
        # Only the first and third values of the 4-tuple are used here.
        raw_pct, _, postprocessed_pct, _ = calculate_model_accuracy(model_results)
        return {
            "Dataset": dataset_name,
            "Model": model_name,
            "Accuracy(raw)": raw_pct,
            "Accuracy(postprocessed)": postprocessed_pct,
        }

    rows = [
        _accuracy_row(dataset_name, model_name, model_results)
        for dataset_name, per_model_results in wrapped_results.results.items()
        for model_name, model_results in per_model_results.items()
    ]
    return pd.DataFrame(rows)


def plot_model_accuracy(wrapped_results: WrappedResults):
    """
    Draw a grouped bar chart of raw and postprocessed accuracy per model.

    Bars are grouped by model and coloured by dataset. The postprocessed bars
    are drawn on the same axes as the raw bars with alpha 0.6 so the two
    metrics can be told apart.

    Args:
        wrapped_results: Evaluation results; `run_name` is shown in the title.
    """
    from matplotlib.patches import Patch

    accuracy_df = wrapped_results_to_pd_frame(wrapped_results)

    # Styling and canvas
    plt.style.use("ggplot")
    sns.set(font_scale=1.2)
    plt.figure(figsize=(16, 10))

    # Both bar layers share everything except the metric, opacity and target axes.
    shared_bar_options = dict(
        x="Model",
        hue="Dataset",
        data=accuracy_df,
        palette="viridis",
        errorbar=None,
    )
    raw_axes = sns.barplot(y="Accuracy(raw)", **shared_bar_options)
    sns.barplot(
        y="Accuracy(postprocessed)",
        alpha=0.6,  # Slightly transparent to distinguish from the raw bars
        ax=raw_axes,
        **shared_bar_options,
    )

    # Titles, labels and axis range
    plt.title(f"Model Accuracy by Dataset ({wrapped_results.run_name})", fontsize=16)
    plt.xlabel("Model", fontsize=14)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xticks(rotation=45, ha="right")
    plt.ylim(0, 100)  # Assuming accuracy is between 0 and 100

    # Custom legend: one raw and one processed entry per dataset
    dataset_names = accuracy_df["Dataset"].unique()
    dataset_colors = sns.color_palette("viridis", len(dataset_names))
    legend_handles = []
    for position, dataset_name in enumerate(dataset_names):
        color = dataset_colors[position]
        legend_handles.extend(
            [
                Patch(facecolor=color, label=f"{dataset_name} (Raw)"),
                Patch(facecolor=color, alpha=0.6, label=f"{dataset_name} (Processed)"),
            ]
        )
    plt.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc="upper left")

    plt.tight_layout()
    plt.show()


def plot_accuracy_delta(baseline_results, optimized_results, top_n=None):
    """
    Plot and summarise the accuracy difference between an optimized and a baseline run.

    Both result sets are converted with `wrapped_results_to_pd_frame` and merged
    on ("Model", "Dataset"). `pd.merge` uses an inner join by default, so only
    pairs present in both runs are compared. The merged table is displayed, the
    raw and postprocessed deltas are drawn as bars on shared axes, and summary
    figures are printed.

    Args:
        baseline_results: Wrapped results from baseline evaluation
        optimized_results: Wrapped results from optimized evaluation
        top_n: Optional limit on how many models the printed ranking lists.
            The default (None) lists every model.

    Returns:
        None. The table, the figure and the printed summary are the output.
    """
    from matplotlib.patches import Patch

    # Extract dataframes from both result sets using the provided function
    baseline_df = wrapped_results_to_pd_frame(baseline_results)
    optimized_df = wrapped_results_to_pd_frame(optimized_results)

    # Merge dataframes to calculate delta
    merged_df = pd.merge(
        baseline_df,
        optimized_df,
        on=["Model", "Dataset"],
        suffixes=("_baseline", "_optimized"),
    )

    # Calculate deltas (optimized minus baseline, in percentage points)
    merged_df["Raw_Delta"] = (
        merged_df["Accuracy(raw)_optimized"] - merged_df["Accuracy(raw)_baseline"]
    )
    merged_df["Processed_Delta"] = (
        merged_df["Accuracy(postprocessed)_optimized"]
        - merged_df["Accuracy(postprocessed)_baseline"]
    )

    # `display` is not imported in this notebook; it is expected to come from
    # the IPython/Jupyter session.
    display(merged_df)

    # Create a figure for the delta plot
    plt.figure(figsize=(16, 10))

    # Plot the raw deltas
    ax = sns.barplot(
        x="Model",
        y="Raw_Delta",
        hue="Dataset",
        data=merged_df,
        palette="coolwarm",
        errorbar=None,
    )

    # Add the processed deltas on the same axes, slightly transparent
    sns.barplot(
        x="Model",
        y="Processed_Delta",
        hue="Dataset",
        data=merged_df,
        palette="coolwarm",
        errorbar=None,
        alpha=0.6,
        ax=ax,
    )

    # Add a horizontal line at y=0
    plt.axhline(y=0, color="black", linestyle="-", alpha=0.3)

    # Customize the plot
    plt.title(
        f"Accuracy Improvement: {optimized_results.run_name} vs {baseline_results.run_name}",
        fontsize=16,
    )
    plt.xlabel("Model", fontsize=14)
    plt.ylabel("Accuracy Improvement (percentage points)", fontsize=14)
    plt.xticks(rotation=45, ha="right")

    # Create a custom legend: one raw and one processed entry per dataset
    legend_elements = []
    datasets = merged_df["Dataset"].unique()
    colors = sns.color_palette("coolwarm", len(datasets))

    for i, dataset in enumerate(datasets):
        legend_elements.append(Patch(facecolor=colors[i], label=f"{dataset} (Raw)"))
        legend_elements.append(
            Patch(facecolor=colors[i], alpha=0.6, label=f"{dataset} (Processed)")
        )

    plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc="upper left")

    # Adjust layout
    plt.tight_layout()

    # Show the plot
    plt.show()

    # Print summary statistics
    print(
        f"Average raw accuracy improvement: {merged_df['Raw_Delta'].mean():.2f} percentage points"
    )
    print(
        f"Average processed accuracy improvement: {merged_df['Processed_Delta'].mean():.2f} percentage points"
    )

    # Rank models by their mean processed delta across datasets. The header
    # states the metric and the number of models that are actually printed.
    ranked_models = (
        merged_df.groupby("Model")["Processed_Delta"]
        .mean()
        .sort_values(ascending=False)
    )
    if top_n is None:
        print("\nModels ranked by mean processed accuracy improvement:")
    else:
        ranked_models = ranked_models.head(top_n)
        print(f"\nTop {top_n} models with biggest processed accuracy improvements:")
    for model, delta in ranked_models.items():
        print(f"  {model}: {delta:.2f} percentage points")


# Compare the optimized run against the baseline run.
plot_accuracy_delta(
    baseline_results=wrapped_results_baseline,
    optimized_results=wrapped_results_optimized,
)