# Data

In [None]:
# Identifiers of the four agent models used in the main experiments.
models = [
    "gpt-4-turbo",
    "gpt-3.5-turbo",
    "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
    "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
]
# One database tag per model, in the same order as `models`. Every tag follows
# the pattern "benchmark_<model>_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2".
tags = [
    f"benchmark_{model_id}_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2"
    for model_id in models
]

#### expanded tags and models

In [None]:
# Additional agent models that only appear in the expanded results.
expanded_models = [
    "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    "together_ai/mistralai/Mixtral-8x22B-Instruct-v0.1",
    "together_ai/Qwen/Qwen1.5-72B-Chat",
    "together_ai/Qwen/Qwen2-72B-Instruct",
    "together_ai/Qwen/Qwen1.5-110B-Chat",
    "together_ai/meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
    "together_ai/meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
    "together_ai/deepseek-ai/deepseek-llm-67b-chat",
]
# One database tag per expanded model, in the same order as `expanded_models`.
expanded_tags = [
    f"benchmark_{model_id}_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2"
    for model_id in expanded_models
]

### Models mapping

In [None]:
# Human-readable names used in tables and plot legends, keyed by model identifier.
models_mapping = dict(
    [
        ("gpt-4-turbo", "GPT-4-turbo"),
        ("gpt-3.5-turbo", "GPT-3.5-turbo"),
        ("together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", "Llama3.1-405B"),
        ("together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "Llama3.1-70B"),
        ("together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "Llama3.1-8B"),
        ("together_ai/mistralai/Mixtral-8x22B-Instruct-v0.1", "Mixtral-8x22B"),
        ("together_ai/Qwen/Qwen1.5-72B-Chat", "Qwen1.5-72B-Chat"),
        ("together_ai/Qwen/Qwen2-72B-Instruct", "Qwen2-72B-Instruct"),
        ("together_ai/Qwen/Qwen1.5-110B-Chat", "Qwen1.5-110B-Chat"),
        ("together_ai/meta-llama/Meta-Llama-3-70B-Instruct-Turbo", "Llama3-70B"),
        ("together_ai/meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "Llama3-8B"),
        ("together_ai/deepseek-ai/deepseek-llm-67b-chat", "DeepSeek-67B"),
    ]
)

# Sec 6.1: main results

In [None]:
# get performance and ratio data
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward

# Aggregates for the main models: raw averages go to `performance_data`,
# binary averages to `ratio_data`; every fetched episode is kept in `all_episodes`.
performance_data, ratio_data, model_rewards = {}, {}, {}
all_episodes = []
for model, tag in zip(models, tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
    print(f"model: {model}, tag: {tag}, episodes: {len(episodes)}")
    avg_rewards = get_avg_reward(episodes, model)  # type: ignore
    binary_avg_rewards = get_avg_reward(episodes, model, binary=True)  # type: ignore
    performance_data[model], ratio_data[model] = avg_rewards, binary_avg_rewards
    all_episodes += episodes

import copy

# Start from independent copies of the main-model aggregates, then add one
# entry per expanded model so the main dictionaries stay untouched.
expanded_performance_data, expanded_ratio_data = (
    copy.deepcopy(performance_data),
    copy.deepcopy(ratio_data),
)
for model, tag in zip(expanded_models, expanded_tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
    print(f"model: {model}, tag: {tag}, episodes: {len(episodes)}")
    avg_rewards = get_avg_reward(episodes, model)  # type: ignore
    binary_avg_rewards = get_avg_reward(episodes, model, binary=True)  # type: ignore
    expanded_performance_data[model], expanded_ratio_data[model] = (
        avg_rewards,
        binary_avg_rewards,
    )

In [None]:
def generate_latex_table(performance, models, wrap=True, display_names=None):
    """Render per-model scores as a LaTeX table and return the LaTeX source.

    Args:
        performance: Mapping ``model -> {dimension key -> sequence}``. The first
            element of each sequence is printed with two decimals; a missing
            model or dimension produces an empty cell.
        models: Model identifiers; one table row per entry, in the given order.
        wrap: If True the table is emitted inside a ``wraptable`` environment,
            otherwise inside a plain ``table`` environment.
        display_names: Optional mapping from model identifier to the label
            printed in the first column. Defaults to the notebook-level
            ``models_mapping``.

    Returns:
        The LaTeX code as a single string.

    Raises:
        KeyError: If a model has no entry in the display-name mapping.
    """
    if display_names is None:
        display_names = models_mapping

    # One header cell per dimension. The value range is kept in the same cell as
    # the dimension name: putting it behind its own "&" gave the header row
    # twice as many cells as the column spec and the data rows, which LaTeX
    # rejects ("Extra alignment tab").
    dimensions = {
        "targeted_safety_risks": "Targeted Safety Risks [-10, 0]",
        "system_and_operational_risks": "System and Operational Risks [-10, 0]",
        "content_safety_risks": "Content Safety Risks [-10, 0]",
        "societal_risks": "Societal Risks [-10, 0]",
        "legal_and_rights_related_risks": "Legal and Rights-Related Risks [-10, 0]",
        "efficiency": "Efficiency [0, 10]",
        "goal": "Goal Completion [0, 10]",
        "overall_score": "Overall Score [0, 10]",
    }

    # Opening of the floating environment
    if wrap:
        latex_code = """
\\begin{wraptable}[13]{r}{8.7cm}
\\small
\\vspace{-10pt}
\\centering
"""
    else:
        latex_code = """
\\begin{table}[h]
\\small
\\centering
"""
    # One right-aligned column for the model name plus one per dimension
    latex_code += (
        "    \\begin{tabularx}{8.7cm}{@{\\hspace{10pt}}"
        + "r" * (len(dimensions) + 1)
        + "@{\\hspace{6pt}}}\n"
    )
    latex_code += "    \\toprule\n"

    # Header row
    latex_code += "         Model "
    latex_code += "".join(f"& {dim_label} " for dim_label in dimensions.values())
    latex_code += "\\\\ \\midrule\n"

    # One row per model
    for model in models:
        model_scores = performance.get(model, {})
        latex_code += f"         {display_names[model]} "
        for dim_key in dimensions:
            if dim_key in model_scores:
                latex_code += f"& {model_scores[dim_key][0]:.2f} "
            else:
                latex_code += "& "
        latex_code += "\\\\ \n"

    # Close the tabular, then the floating environment
    latex_code += """    \\bottomrule
    \\end{tabularx}
    \\vspace{-5pt}
    \\caption{The aggregated performance of each model by averaging across different episodes.}
    \\label{tab:model_safety_evaluation}
"""
    latex_code += "\\end{wraptable}\n" if wrap else "\\end{table}\n"
    return latex_code

In [None]:
# Binary-average table covering the main and the expanded models, rendered as
# a regular (non-wrapped) table.
latex_code = generate_latex_table(
    expanded_ratio_data, [*models, *expanded_models], wrap=False
)
print(latex_code)

### Risk ratio graph

In [None]:
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import seaborn as sns


def generate_bar_graph(
    ratio_data: Dict[str, Dict[str, Tuple[float, float]]],
    width: int = 12,
    save_path: str = "",
    overall_score_only: bool = False,
    bar_width: float = 0.2,
    image_path: str = "table_haico.png",
) -> None:
    """Draw a grouped bar chart of per-category values, one bar per model.

    Models whose identifier contains "gpt" take their colour from the green
    palette and all other models from the blue palette. Legend labels are
    looked up in the notebook-level ``models_mapping``.

    Args:
        ratio_data: ``model -> {category -> (value, ...)}``. Only the first
            element of each tuple is plotted; a missing category is drawn as 0.
        width: Currently unused; the figure size is fixed at 48x20 inches.
        save_path: If non-empty, the figure is also written there as a PDF.
        overall_score_only: Plot only the "overall_score" category.
        bar_width: Width of a single bar; category groups are one unit apart.
        image_path: Image overlaid near the top of the axes. Pass an empty
            string to draw the chart without an overlay.
    """
    from matplotlib.offsetbox import AnnotationBbox, OffsetImage

    # Category key -> tick label, in plotting order
    category_acronyms = {
        "overall_score": "Overall",
        "targeted_safety_risks": "Targ",
        "system_and_operational_risks": "Syst",
        "content_safety_risks": "Cont",
        "societal_risks": "Soc",
        "legal_and_rights_related_risks": "Legal",
    }
    if overall_score_only:
        category_acronyms = {"overall_score": "Overall"}
    categories = list(category_acronyms)
    models = list(ratio_data.keys())

    fig, ax = plt.subplots(figsize=(48, 20))

    # Colour palettes for gpt models (greens) and all other models (blues)
    gpt_palette = sns.color_palette(["#9ad3c4", "#63bba4"])
    llama_palette = sns.color_palette(["#add7ff", "#0081fb", "#0081fb"])

    group_positions = range(len(categories))  # x location of each category group
    spacing_factor = 1  # distance between neighbouring category groups
    gpt_index = 0
    llama_index = 0
    for i, model in enumerate(models):
        # Wrap around the palette so that more models than colours no longer
        # raises IndexError.
        if "gpt" in model.lower():
            color = gpt_palette[gpt_index % len(gpt_palette)]
            gpt_index += 1
        else:
            color = llama_palette[llama_index % len(llama_palette)]
            llama_index += 1

        heights = [
            ratio_data[model].get(category, (0.0, 0.0))[0] for category in categories
        ]
        bars = ax.bar(
            [p * spacing_factor + i * bar_width for p in group_positions],
            heights,
            bar_width,
            label=models_mapping[model],
            color=color,
        )
        # Print the value on top of every bar
        for bar in bars:
            yval = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                yval + 0.01,
                f"{yval:.2f}",
                ha="center",
                va="bottom",
                fontsize=40,
            )

    # Overlay the image, positioned in axes-fraction coordinates
    if image_path:
        image = plt.imread(image_path)
        imagebox = OffsetImage(image, zoom=0.36)
        ab = AnnotationBbox(
            imagebox,
            (0.58, 0.85),
            frameon=False,
            xycoords="axes fraction",
            boxcoords="axes fraction",
            pad=0.1,
        )
        ax.add_artist(ab)

    # Labels, ticks and legend
    ax.set_yticklabels([])  # Remove the numbers on the y axis
    ax.set_ylabel("Risk Ratio", fontsize=48, fontweight="bold")
    # Centre each tick under its group of bars
    ax.set_xticks(
        [
            p * spacing_factor + (bar_width * (len(models) - 1) / 2)
            for p in group_positions
        ]
    )
    ax.set_xticklabels([category_acronyms[cat] for cat in categories], fontsize=48)
    ax.legend(
        title="Models",
        title_fontsize="50",
        fontsize="44",
        loc="best",
        frameon=True,
        fancybox=True,
        shadow=True,
    )
    sns.despine()

    # Finalise the layout before saving so the PDF matches what is shown.
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()

In [None]:
# Bar chart for the models in `ratio_data`, also saved as a PDF.
# NOTE(review): the output path is absolute and specific to one machine.
generate_bar_graph(
    ratio_data,
    save_path="/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/risk_ratio_graph.pdf",
)

### Comparison across models

In [None]:
from collections import defaultdict

from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile


def compare_across_models(tags: list[str]) -> list[list]:
    """Pair up episodes of two runs and compare their targeted-safety scores.

    Episodes are matched on (environment, first agent, second agent). A setup
    is reported only when each tag contributes exactly one episode for it, so
    two episodes of the same tag are never compared with each other.

    Args:
        tags: Exactly two episode tags.

    Returns:
        One ``[codename, score_for_tags[0], score_for_tags[1]]`` entry per
        matched setup, where each score is
        ``episode.rewards[1][1]["targeted_safety_risks"]``.

    Raises:
        AssertionError: If ``tags`` does not contain exactly two entries.
    """
    assert len(tags) == 2, "The function only supports two tags for comparison."

    def _group_by_setup(tag: str) -> dict:
        # setup key -> episodes stored under `tag` for that setup
        grouped = defaultdict(list)
        for episode in EpisodeLog.find(EpisodeLog.tag == tag).all():
            key_string = ";".join(
                [episode.environment, episode.agents[0], episode.agents[1]]
            )
            grouped[key_string].append(episode)
        return grouped

    first_by_setup = _group_by_setup(tags[0])
    second_by_setup = _group_by_setup(tags[1])

    comparison_results = []
    for key, first_episodes in first_by_setup.items():
        second_episodes = second_by_setup.get(key, [])
        if len(first_episodes) != 1 or len(second_episodes) != 1:
            continue
        # Look the profile up only for setups that are actually reported.
        environment = HaiEnvironmentProfile.get(key.split(";")[0])
        performance1 = first_episodes[0].rewards[1][1]["targeted_safety_risks"]
        performance2 = second_episodes[0].rewards[1][1]["targeted_safety_risks"]
        comparison_results.append([environment.codename, performance1, performance2])
    return comparison_results

In [None]:
import rich

# Scenario-level comparison between the Llama-3.1 405B and 70B runs.
comparison = compare_across_models(
    [
        "benchmark_together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2",
        "benchmark_together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2",
    ]
)
rich.print(comparison)

### Safety and goal

In [None]:
# Episodes of the third and fourth entries of `tags`, one list per tag.
episodes_list = [
    EpisodeLog.find(EpisodeLog.tag == selected_tag).all()
    for selected_tag in (tags[2], tags[3])
]

In [None]:
## A graph plotting safety (x-axis) against goal completion (y-axis) across different scenarios
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile


def plot_safety_goal_scatter(
    episodes_list: list[list[EpisodeLog]],
    save_path: str = None,
    if_show_codename: bool = False,
    show_intent: bool = False,
    consider_difficulty: bool = False,
    do_regression: bool = False,
    do_zoom_seperate: bool = False,
) -> None:
    """Scatter goal completion against targeted safety risk, one panel per model.

    Scores are averaged per (model, environment), where the model is
    ``episode.models[2]`` and the scores are read from
    ``episode.rewards[1][1]``. Panels follow the order in which the models
    first appear in ``episodes_list``.

    Args:
        episodes_list: Lists of episodes; together they must cover exactly two
            distinct models.
        save_path: If given, the figure is also written there as a PDF.
        if_show_codename: Label every point with the environment codename.
        show_intent: Colour points by the environment's first agent intent
            label and draw one regression line per intent.
        consider_difficulty: Recolour environments on which the two models
            differ by less than 2 in both safety and goal.
        do_regression: Draw a regression line and print the correlation
            (ignored when ``show_intent`` is set).
        do_zoom_seperate: Shade the regions with safety above -4 and goal
            above 6 (ignored when ``show_intent`` is set).

    Raises:
        AssertionError: If the episodes do not cover exactly two models.
    """
    from matplotlib.lines import Line2D

    # Collect the raw scores per (model, environment)
    safety_scores = {}
    goal_scores = {}
    env_codenames = {}
    user_intent = {}
    for episodes in episodes_list:
        for episode in episodes:
            env = episode.environment
            key = (episode.models[2], env)
            evaluation = episode.rewards[1][1]
            safety_scores.setdefault(key, []).append(
                evaluation["targeted_safety_risks"]
            )
            goal_scores.setdefault(key, []).append(evaluation["goal"])
            # Fetch each profile once, whichever list the environment first
            # appears in, so later lookups cannot miss an environment.
            if env not in env_codenames:
                env_profile = HaiEnvironmentProfile.get(env)
                env_codenames[env] = env_profile.codename
                user_intent[env] = env_profile.agent_intent_labels[0]

    avg_safety_scores = {
        key: sum(scores) / len(scores) for key, scores in safety_scores.items()
    }
    avg_goal_scores = {
        key: sum(scores) / len(scores) for key, scores in goal_scores.items()
    }
    # How many (model, environment) points share the same (safety, goal) pair;
    # used as the marker size.
    pair_counts = Counter(
        (avg_safety_scores[key], avg_goal_scores[key]) for key in avg_safety_scores
    )

    # First-seen order keeps the panel assignment stable between runs
    # (iterating a set of strings is not).
    models_list = list(dict.fromkeys(model for model, _ in avg_safety_scores))
    print(models_list)
    assert len(models_list) == 2, (
        "The function only supports two models for comparison."
    )

    if show_intent:
        model_colors = {
            models_list[0]: {"benign": "#1f77b4", "malicious": "#d62728"},  # blue, red
            models_list[1]: {
                "benign": "#2ca02c",
                "malicious": "#ff7f0e",
            },  # green, orange
        }
    else:
        # A single colour per model, independent of the intent
        model_colors = {
            models_list[0]: {"benign": "#339af0", "malicious": "#339af0"},
            models_list[1]: {"benign": "#22b8cf", "malicious": "#22b8cf"},
        }

    # Environments on which both models score similarly
    env_differences = {}
    if consider_difficulty:
        for env in env_codenames:
            first_key, second_key = [(model, env) for model in models_list]
            if first_key in avg_safety_scores and second_key in avg_safety_scores:
                safety_diff = abs(
                    avg_safety_scores[first_key] - avg_safety_scores[second_key]
                )
                goal_diff = abs(avg_goal_scores[first_key] - avg_goal_scores[second_key])
                env_differences[env] = (safety_diff < 2) and (goal_diff < 2)

    data = []
    for key, safety in avg_safety_scores.items():
        model, env = key
        goal = avg_goal_scores[key]
        color = model_colors[model][user_intent[env]]
        # Environments scored by only one model have no entry; treat them as
        # "not similar" instead of failing.
        if consider_difficulty and env_differences.get(env, False):
            color = "#8879de"
        data.append(
            {
                "model": model,
                "safety": safety,
                "goal": goal,
                "size": pair_counts[(safety, goal)],
                "color": color,
                "intent": user_intent[env] if show_intent else "",
                "codename": env_codenames[env] if if_show_codename else "",
            }
        )
    df = pd.DataFrame(data)

    # Set the theme before creating the axes; rc settings only affect axes
    # created afterwards.
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="whitegrid", rc=custom_params)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

    annotation_positions = {"malicious": (0.95, 0.05), "benign": (0.95, 0.15)}
    for ax, model in zip(axes, models_list):
        model_df = df[df["model"] == model]
        sns.scatterplot(
            ax=ax,
            data=model_df,
            x="safety",
            y="goal",
            size="size",
            sizes=(200, 800),
            hue="color",
            palette=model_df["color"].unique(),
            alpha=0.5,
            edgecolor="w",
            linewidth=0.5,
            legend=False,
        )

        if show_intent:
            # One regression line and correlation per intent
            for intent, intent_color in model_colors[model].items():
                intent_df = model_df[model_df["intent"] == intent]
                sns.regplot(
                    ax=ax,
                    data=intent_df,
                    x="safety",
                    y="goal",
                    scatter=False,
                    color=intent_color,
                    line_kws={"linewidth": 1, "alpha": 0.7},
                )
                correlation = intent_df["safety"].corr(intent_df["goal"])
                ax.annotate(
                    f"{intent} Correlation: {correlation:.2f}",
                    xy=annotation_positions[intent],
                    xycoords="axes fraction",
                    fontsize=12,
                    ha="right",
                    va="bottom",
                    color=intent_color,
                )
        else:
            if do_regression:
                sns.regplot(
                    ax=ax,
                    data=model_df,
                    x="safety",
                    y="goal",
                    scatter=False,
                    color="purple",
                    line_kws={"linewidth": 1, "alpha": 0.7},
                )
                correlation = model_df["safety"].corr(model_df["goal"])
                ax.annotate(
                    f"Correlation: {correlation:.2f}",
                    xy=(0.95, 0.05),
                    xycoords="axes fraction",
                    fontsize=12,
                    ha="right",
                    va="bottom",
                    color="purple",
                )
            if do_zoom_seperate:
                # Mark and shade the regions with safety > -4 and goal > 6
                ax.axvline(x=-4, color="green", linestyle="--", linewidth=1)
                ax.axhline(y=6, color="purple", linestyle="--", linewidth=1)
                ax.axvspan(-4, ax.get_xlim()[1], color="green", alpha=0.1)
                ax.axhspan(6, ax.get_ylim()[1], color="purple", alpha=0.1)

        # Diagonal from (-10, 0) to (0, 10)
        ax.plot([-10, 0], [0, 10], ls="--", c=".3", linewidth=3, alpha=0.3)

        if if_show_codename:
            for _, row in model_df.iterrows():
                ax.text(
                    row["safety"], row["goal"], row["codename"], fontsize=9, ha="right"
                )

    # Bring the two panels closer together
    plt.subplots_adjust(wspace=0.1)

    # Legend and axis cosmetics for each panel
    for ax, model in zip(axes, models_list):
        if show_intent:
            legend_title = "Model - Intent"
            legend_entries = [
                (f"{models_mapping[model]} - {intent}", intent_color)
                for intent, intent_color in model_colors[model].items()
            ]
        else:
            legend_title = "Model"
            legend_entries = [
                (f"{models_mapping[model]}", model_colors[model]["malicious"])
            ]
        legend_elements = [
            Line2D(
                [0],
                [0],
                marker="o",
                color="w",
                label=label,
                markerfacecolor=marker_color,
                markersize=10,
            )
            for label, marker_color in legend_entries
        ]

        ax.set_xlabel("Targeted Safety Risk Score")
        ax.set_ylabel("Goal Completion Score")
        ax.tick_params(axis="both", labelsize=14)
        ax.xaxis.label.set_size(14)
        ax.yaxis.label.set_size(14)
        ax.legend(handles=legend_elements, title=legend_title, loc="upper left")

    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()


# Scatter plot with only the zoom regions enabled; every other switch keeps
# its default of False.
# NOTE(review): the output path is absolute and specific to one machine.
plot_safety_goal_scatter(
    episodes_list,
    save_path="/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/safety_goal_scatter_llama.pdf",
    do_zoom_seperate=True,
)

# Sec 6.2: human intents

### Get the data

In [None]:
# Obtain the data
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward


def calculate_model_rewards(models, tags, remove_tools):
    """Compute binary average rewards per model, split by the human's intent.

    Each episode is assigned to a bucket by the first agent intent label of
    its environment ("benign" versus anything else). When ``remove_tools`` is
    true, episodes whose environment has no toolkits go into separate
    "without tools" buckets.

    Args:
        models: Model identifiers.
        tags: Episode tags, paired with ``models`` by position.
        remove_tools: Whether tool-free environments are bucketed separately.

    Returns:
        ``model -> (benign, malicious, malicious_without_tools)``, each the
        result of ``get_avg_reward(..., binary=True)`` on that bucket.
    """
    # NOTE(review): with remove_tools=False the "without tools" buckets are
    # always empty; confirm that get_avg_reward tolerates an empty list.
    model_rewards = {}
    for model, tag in zip(models, tags):
        episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
        # Buckets keyed by (is_benign, counted_as_tool_free)
        buckets = {
            (is_benign, tool_free): []
            for is_benign in (True, False)
            for tool_free in (True, False)
        }
        profiles = {}  # environment id -> profile, one lookup per environment
        for episode in episodes:
            env_id = episode.environment
            if env_id not in profiles:
                profiles[env_id] = HaiEnvironmentProfile.get(env_id)
            env = profiles[env_id]
            is_benign = env.agent_intent_labels[0] == "benign"
            tool_free = bool(remove_tools) and len(env.toolkits) == 0
            buckets[(is_benign, tool_free)].append(episode)

        # Best effort: this bucket may not be aggregatable. The failure is
        # reported instead of being dropped silently.
        try:
            benign_binary_avg_rewards_wo_tools = get_avg_reward(
                buckets[(True, True)], model, binary=True
            )  # type: ignore
        except Exception as error:
            print(f"benign episodes without tools not aggregated for {model}: {error!r}")
            benign_binary_avg_rewards_wo_tools = {}

        malicious_binary_avg_rewards_wo_tools = get_avg_reward(
            buckets[(False, True)], model, binary=True
        )  # type: ignore
        benign_binary_avg_rewards = get_avg_reward(
            buckets[(True, False)], model, binary=True
        )  # type: ignore
        malicious_binary_avg_rewards = get_avg_reward(
            buckets[(False, False)], model, binary=True
        )  # type: ignore
        # The benign tool-free aggregate is only printed, not returned.
        print(benign_binary_avg_rewards_wo_tools)
        model_rewards[model] = (
            benign_binary_avg_rewards,
            malicious_binary_avg_rewards,
            malicious_binary_avg_rewards_wo_tools,
        )
    return model_rewards


# Intent-split rewards for the main models, with tool-free environments
# bucketed separately.
model_rewards = calculate_model_rewards(
    models=models, tags=tags, remove_tools=True
)

### Get the plot malicious vs benign

In [None]:
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import seaborn as sns


def draw_overall_score_bar_plot(
    data: Dict[
        str,
        Tuple[
            Dict[str, Tuple[float, float]],
            Dict[str, Tuple[float, float]],
            Dict[str, Tuple[float, float]],
        ],
    ],
    save_path: str,
) -> None:
    """Plot the overall score of every model for three intent/tool settings.

    Args:
        data: ``model -> (benign, malicious, malicious_without_tools)``; the
            first element of each dictionary's "overall_score" entry is
            plotted. Bar labels are looked up in the notebook-level
            ``models_mapping``.
        save_path: If non-empty, the figure is also written there as a PDF.

    Raises:
        KeyError: If a dictionary lacks "overall_score" or a model has no
            entry in ``models_mapping``.
    """
    # Long-format columns: one entry per (model, setting)
    models, scores, intents = [], [], []
    for model_name, (benign, malicious, malicious_wo_tools) in data.items():
        display_name = models_mapping[model_name]
        for intent_label, rewards in (
            ("Benign (w/ tools)", benign),
            ("Malicious (w/ tools)", malicious),
            ("Malicious (w/o tools)", malicious_wo_tools),
        ):
            models.append(display_name)
            scores.append(rewards["overall_score"][0])
            intents.append(intent_label)
    plot_data = {"Model": models, "Overall Score": scores, "Intent": intents}

    palette = {
        "Benign (w/ tools)": "#20c997",
        "Malicious (w/ tools)": "#aca2e8",
        "Malicious (w/o tools)": "#8879de",
    }
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="whitegrid", rc=custom_params)

    plt.figure(figsize=(6, 4))
    ax = sns.barplot(
        x="Model", y="Overall Score", hue="Intent", data=plot_data, palette=palette
    )

    plt.xlabel("")
    plt.ylabel("Overall Risk Ratio")
    plt.ylim(0, 1)

    # Print the value above every bar with a positive height
    for p in ax.patches:
        height = p.get_height()
        if height > 0:
            ax.annotate(
                f"{height:.2f}",
                (p.get_x() + p.get_width() / 2.0, height),
                ha="center",
                va="bottom",
                xytext=(0, 8),
                textcoords="offset points",
                fontsize=10,
                color="black",
            )

    # Create the legend once, so the displayed figure matches the saved one.
    plt.legend(title="Intent", title_fontsize="10", fontsize="8", loc="upper left")

    # Finalise the layout before saving.
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()


# Benign vs. malicious overall-score comparison, also saved as a PDF.
# NOTE(review): the output path is absolute and specific to one machine.
draw_overall_score_bar_plot(
    data=model_rewards,
    save_path="/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/human_intent_plot.pdf",
)

### malicious intent inferring ability

In [None]:
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward

# Rebuild `model_rewards` as model -> (average rewards with tools, average
# rewards without tools), splitting on whether the environment has toolkits.
model_rewards = {}
for model, tag in zip(models, tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
    episodes_with_tools, episodes_wo_tools = [], []
    for episode in episodes:
        env = HaiEnvironmentProfile.get(episode.environment)
        tools_or_not = len(env.toolkits) > 0
        (episodes_with_tools if tools_or_not else episodes_wo_tools).append(episode)
    print(
        f"the number of the datapoints for goal and risk for each model: {len(episodes_wo_tools)}"
    )
    # The tool-free aggregate falls back to an empty dict on any failure
    try:
        avg_rewards_wo_tools = get_avg_reward(episodes_wo_tools, model)  # type: ignore
    except Exception:
        avg_rewards_wo_tools = {}
    avg_rewards_with_tools = get_avg_reward(episodes_with_tools, model)  # type: ignore
    model_rewards[model] = (avg_rewards_with_tools, avg_rewards_wo_tools)

In [None]:
from typing import Dict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Whitegrid theme with the top and right spines switched off.
custom_params = dict.fromkeys(("axes.spines.right", "axes.spines.top"), False)
sns.set_theme(rc=custom_params, style="whitegrid")


def draw_malicious_intent_bar_plot(
    data: dict[
        str, tuple[dict[str, tuple[float, float]], dict[str, tuple[float, float]]]
    ],
    save_path: str,
) -> None:
    """Plot efficiency and risk stacked per model, next to a second risk bar.

    Args:
        data: ``model -> (first, second)`` reward dictionaries. The stacked
            "Efficiency"/"Risk" bar uses the "efficiency" and
            "targeted_safety_risks" entries of ``first``; the separate
            "Risk (wo tools)" bar uses "targeted_safety_risks" of ``second``.
            The notebook fills these slots with the with-tools and
            without-tools aggregates. ``second`` may be empty, in which case
            its bar has height 0 and is not annotated.
        save_path: If non-empty, the figure is also written there as a PDF.

    Raises:
        KeyError: If ``first`` lacks a required entry or a model has no entry
            in the notebook-level ``models_mapping``.
    """
    models = []
    goal_scores = []
    risk_scores = []
    risk_scores_wo_tools = []
    for model_name, (with_tools, wo_tools) in data.items():
        models.append(models_mapping[model_name])
        risk_scores.append(with_tools["targeted_safety_risks"][0])
        goal_scores.append(with_tools["efficiency"][0])
        # The second aggregate can be an empty dict when it could not be
        # computed; plot 0 instead of raising KeyError.
        risk_scores_wo_tools.append(
            wo_tools.get("targeted_safety_risks", (0.0, 0.0))[0]
        )

    custom_palette = {
        "Efficiency": "#63e6be",
        "Risk": "#ff6b6b",
        "Risk (wo tools)": "orange",
    }
    plot_data_df = pd.DataFrame(
        {
            "Model": models,
            "Efficiency": goal_scores,
            "Risk": risk_scores,
            "Risk (wo tools)": risk_scores_wo_tools,
        }
    )

    fig, ax = plt.subplots(figsize=(6, 4))
    bar_offset = 0.5  # shifts the two bars of a model to either side of its tick
    bar_width = 0.25

    # Efficiency and Risk as one stacked bar
    plot_data_df.plot(
        x="Model",
        y=["Efficiency", "Risk"],
        kind="bar",
        stacked=True,
        color=[custom_palette["Efficiency"], custom_palette["Risk"]],
        width=bar_width,
        ax=ax,
        position=1 - bar_offset,
    )

    # Risk (wo tools) as a separate bar on the same axes
    plot_data_df.plot(
        x="Model",
        y="Risk (wo tools)",
        kind="bar",
        color=custom_palette["Risk (wo tools)"],
        ax=ax,
        width=bar_width,
        position=1 + bar_offset,
    )

    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)  # horizontal tick labels
    ax.set_xlabel("")  # Remove the x-axis label
    ax.legend(fontsize="x-small")

    # Print the value at the end of every non-zero bar segment, above positive
    # segments and below negative ones.
    for p in ax.patches:
        height = p.get_height()
        if height != 0.0:
            ax.annotate(
                format(height, ".2f"),
                (p.get_x() + p.get_width() / 2.0, height),
                ha="center",
                va="center",
                fontsize="x-small",
                xytext=(0, 9 if height > 0 else -9),
                textcoords="offset points",
            )
    ax.set_ylim(bottom=-8)  # extend the y axis down to -8
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()


# Plot the with-tools / without-tools aggregates and save the figure next to
# the notebook.
draw_malicious_intent_bar_plot(
    data=model_rewards,
    save_path="./malicious_intent_plot.pdf",
)

# Sec 6.3: Access to the tools


In [None]:
from typing import Dict

import matplotlib.pyplot as plt
import seaborn as sns


def _format_score_label(score: float) -> str:
    """Format ``score`` with at most two decimals, dropping trailing zeros.

    Only zeros after the decimal point are removed, so ``10.0`` becomes
    ``"10"`` and ``0.5`` becomes ``"0.5"``.
    """
    text = f"{score:.2f}"
    # ``:.2f`` always emits a decimal point, so these two strips can only
    # remove fractional zeros and then the dangling point. A further
    # ``rstrip("0")`` would start eating integer digits ("10" -> "1").
    return text.rstrip("0").rstrip(".")


def draw_efficiency_goal_bar_plot(
    data: Dict[str, Dict[str, Tuple[float, float]]], save_path: str = None
) -> None:
    """Draw a grouped bar plot of efficiency, goal and targeted safety risks.

    Args:
        data: Maps a model name to its metrics. For each of the keys
            ``"efficiency"``, ``"goal"`` and ``"targeted_safety_risks"`` the
            first element of the stored tuple is plotted.
        save_path: When truthy, the figure is also written to this path as a
            PDF.

    Raises:
        KeyError: If a model name is missing from the notebook-level
            ``models_mapping`` or a metric key is missing for a model.

    The x-axis label of every model is looked up in ``models_mapping``, which
    is defined elsewhere in the notebook.
    """
    # Metric key in ``data`` -> label used for the hue / palette lookup.
    metric_labels = {
        "efficiency": "Efficiency",
        "goal": "Goal",
        "targeted_safety_risks": "targeted_safety_risks",
    }

    model_labels = []
    scores = []
    metrics = []
    for model_name, metrics_dict in data.items():
        for metric_key, metric_label in metric_labels.items():
            model_labels.append(models_mapping[model_name])
            # Element 0 of the tuple is the value that gets plotted.
            scores.append(metrics_dict[metric_key][0])
            metrics.append(metric_label)

    # Long-form records: one entry per (model, metric) pair.
    plot_data = {"Model": model_labels, "Score": scores, "Metric": metrics}

    palette = {
        "Efficiency": "#20c997",
        "Goal": "#4dabf7",
        "targeted_safety_risks": "#aca2e8",
    }

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(
        x="Model", y="Score", hue="Metric", data=plot_data, palette=palette, ax=ax
    )

    ax.set_xlabel("")
    ax.set_ylabel("Score")
    # One unit of head-room on both sides so the bar labels fit.
    ax.set_ylim(min(scores) - 1, max(scores) + 1)

    for patch in ax.patches:
        height = patch.get_height()
        if height == 0.0:
            # Zero-height patches carry no information; the other plotting
            # helpers in this notebook skip them as well.
            continue
        is_positive = height > 0
        ax.annotate(
            _format_score_label(height),
            (patch.get_x() + patch.get_width() / 2.0, height),
            ha="center",
            # Put the label outside the bar: above positive bars, below
            # negative ones.
            va="bottom" if is_positive else "top",
            xytext=(0, 8 if is_positive else -8),
            textcoords="offset points",
            fontsize=10,
            color="black",
        )

    # Legend: keep "Efficiency" / "Goal", show every other metric as "Targ".
    handles, labels = ax.get_legend_handles_labels()
    labels = [label if label in ("Efficiency", "Goal") else "Targ" for label in labels]
    ax.legend(handles, labels, title="Metric", title_fontsize="10", fontsize="8")

    # Lay the figure out before saving so the PDF matches what is displayed.
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")

    plt.show()


# Render the access-to-tools figure from `performance_data` and save it as a PDF.
# NOTE(review): `save_path` is an absolute path into one user's home directory;
# presumably this fails on any other machine — consider a repo-relative path.
draw_efficiency_goal_bar_plot(
    performance_data,
    save_path="/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/access_to_tools_plot.pdf",
)

### Correlation between efficiency and safety risks


In [None]:
# Sec 6.4: Correlation between efficiency and safety risks

from typing import Dict

import numpy as np
import scipy.stats as stats


def calculate_correlation(data: Dict[str, Dict[str, Tuple[float, float]]]) -> float:
    """Return the Pearson correlation between efficiency and targeted safety risks.

    For every entry of ``data`` the first element of its ``"efficiency"`` and
    ``"targeted_safety_risks"`` tuples is collected. Each list of scores is
    printed right after it is built, and the coefficient is then computed with
    ``scipy.stats.pearsonr``.
    """

    def first_components(metric: str) -> list:
        # Element 0 of the metric tuple of every entry, in dict order.
        collected = []
        for per_model_metrics in data.values():
            collected.append(per_model_metrics[metric][0])
        return collected

    efficiency = first_components("efficiency")
    print(efficiency)
    risks = first_components("targeted_safety_risks")
    print(risks)

    # pearsonr yields (coefficient, p-value); only the coefficient is returned.
    pearson_result = stats.pearsonr(efficiency, risks)
    return pearson_result[0]


# Model-level correlation: one (efficiency, targeted safety risk) pair per entry
# of `expanded_performance_data`.
correlation_coefficient = calculate_correlation(expanded_performance_data)

print(
    f"The correlation coefficient between efficiency and safety risks is: {correlation_coefficient:.4f}"
)

In [None]:
## correlation with all episodes
def calculate_correlation(episodes: list[EpisodeLog]) -> float:
    """Return the Pearson correlation between efficiency and targeted safety
    risks, computed over individual episodes.

    Both scores are read from ``episode.rewards[1][1]`` of every episode.

    Note: this reuses the name of the model-level ``calculate_correlation``
    defined in an earlier cell, so that version is replaced once this cell
    has run.
    """

    def per_episode(metric: str) -> list:
        # One value per episode, in the order of ``episodes``.
        return list(map(lambda log: log.rewards[1][1][metric], episodes))

    efficiency = per_episode("efficiency")
    risks = per_episode("targeted_safety_risks")

    # pearsonr yields (coefficient, p-value); only the coefficient is returned.
    return stats.pearsonr(efficiency, risks)[0]


# Episode-level correlation over `all_episodes`; this rebinds
# `correlation_coefficient` from the model-level cell above.
correlation_coefficient = calculate_correlation(all_episodes)
print(
    f"The correlation coefficient between efficiency and safety risks is: {correlation_coefficient:.4f}"
)

# Sec 6.4: Single-turn vs. multi-turn interactions

In [None]:
# get performance and ratio data
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward

# Results keyed by (model, jailbreak keyword). Each value is a pair:
# (result for the regular tag, result for the matching "wo_interaction" tag).
filter_performance_data = {}
filter_ratio_data = {}
# NOTE(review): nothing in this cell appends to `filter_all_episodes`.
filter_all_episodes = []
# Substrings matched against environment codenames to pick the jailbreak scenarios.
filter_key_words = ["jailbreak_dan", "jailbreak_persuasive", "jailbreak_wildteaming"]
featured_models = [models[0], models[1], models[2], models[3]]
# One (regular tag, "wo_interaction" tag) pair per featured model, in the same order.
featured_tags = [
    (
        tags[0],
        "benchmark_gpt-4-turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_wo_interaction_2",
    ),
    (
        tags[1],
        "benchmark_gpt-3.5-turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_wo_interaction_2",
    ),
    (
        tags[2],
        "benchmark_together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_wo_interaction_2",
    ),
    (
        tags[3],
        "benchmark_together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_wo_interaction_2",
    ),
]

# Map each keyword to the primary keys of the environments whose codename contains it.
filtered_dict = {filter_key_word: [] for filter_key_word in filter_key_words}
enviroment_profiles = HaiEnvironmentProfile.find().all()
for env in enviroment_profiles:
    assert isinstance(env, HaiEnvironmentProfile)
    env_name = env.codename
    for filter_key_word in filter_key_words:
        if filter_key_word in env_name:
            filtered_dict[filter_key_word].append(env.pk)

# For every featured model, load both episode sets and score them per keyword.
for model, featured_tag in zip(featured_models, featured_tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == featured_tag[0]).all()
    episodes_wo_interaction = EpisodeLog.find(EpisodeLog.tag == featured_tag[1]).all()
    print(f"fetching {len(episodes)} episodes for {model}")
    print(
        f"fetching {len(episodes_wo_interaction)} episodes without interaction for {model}"
    )
    for filter_key_word in filter_key_words:
        # Keep only the episodes that ran in an environment matching this keyword.
        filtered_episodes = [
            episode
            for episode in episodes
            if episode.environment in filtered_dict[filter_key_word]
        ]
        filtered_episodes_wo_interaction = [
            episode
            for episode in episodes_wo_interaction
            if episode.environment in filtered_dict[filter_key_word]
        ]
        # Each episode set is scored twice: once with the default settings and
        # once with `binary=True`.
        avg_rewards = get_avg_reward(filtered_episodes, model)  # type: ignore
        binary_avg_rewards = get_avg_reward(filtered_episodes, model, binary=True)  # type: ignore
        avg_rewards_wo_interaction = get_avg_reward(
            filtered_episodes_wo_interaction, model
        )  # type: ignore
        binary_avg_rewards_wo_interaction = get_avg_reward(
            filtered_episodes_wo_interaction, model, binary=True
        )  # type: ignore
        filter_performance_data[(model, filter_key_word)] = (
            avg_rewards,
            avg_rewards_wo_interaction,
        )
        filter_ratio_data[(model, filter_key_word)] = (
            binary_avg_rewards,
            binary_avg_rewards_wo_interaction,
        )

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_data(
    filter_ratio_data,
    include_performance_data=False,
    save_path="/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/interaction_type_plot.pdf",
):
    """Plot the overall risk ratio per jailbreak family, multi-turn vs. single-turn.

    Args:
        filter_ratio_data: Maps ``(model, filter_key_word)`` to a pair of
            results. Element 0 is plotted as "Multi-turn Interaction" and
            element 1 as "Single-turn Interaction"; from each,
            ``["overall_score"][0]`` is the bar height and
            ``["overall_score"][1]`` is stored in the "Variance" column.
        include_performance_data: Accepted for compatibility; not used.
        save_path: Where the figure is written as a PDF. Defaults to the path
            that used to be hard-coded here (NOTE(review): it is absolute and
            machine-specific). Pass a falsy value to skip saving.

    Only the first model found in ``filter_ratio_data`` is drawn. Calling this
    function also sets the global seaborn theme.
    """
    # Interaction label -> bar colour; the position of each label is also the
    # index of its result inside every value of ``filter_ratio_data``.
    interaction_palette = {
        "Multi-turn Interaction": "#aca2e8",
        "Single-turn Interaction": "#20c997",
    }

    records = []
    for (model, filter_key_word), ratio_data_tuple in filter_ratio_data.items():
        for position, interaction_type in enumerate(interaction_palette):
            overall_score = ratio_data_tuple[position]["overall_score"]
            records.append(
                {
                    "Model": model,
                    "Filter Key Word": filter_key_word,
                    "Interaction Type": interaction_type,
                    "Overall Risk": overall_score[0],
                    "Variance": overall_score[1],
                }
            )
    df = pd.DataFrame(records)

    # Replace the raw keywords with the labels shown on the x-axis.
    filter_key_word_mapping = {
        "jailbreak_dan": "DAN",
        "jailbreak_persuasive": "PAP",
        "jailbreak_wildteaming": "WildTeaming",
    }
    df["Filter Key Word"] = df["Filter Key Word"].map(filter_key_word_mapping)

    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="whitegrid", rc=custom_params)

    # Only the first model is plotted.
    first_model = df["Model"].unique()[0]
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(
        x="Filter Key Word",
        y="Overall Risk",
        hue="Interaction Type",
        data=df[df["Model"] == first_model],
        palette=interaction_palette,
        ax=ax,
    )
    ax.set_ylabel("Overall Risk Ratio", fontsize=10)
    ax.set_xlabel("", fontsize=10)
    ax.legend(title="Interaction Type", fontsize=8, title_fontsize=10)

    # Value labels on top of the bars; zero-height patches are skipped.
    # The "Variance" column is kept in ``df`` but no error bars are drawn.
    for patch in ax.patches:
        height = patch.get_height()
        if height != 0.0:
            ax.annotate(
                format(height, ".2f"),
                (patch.get_x() + patch.get_width() / 2.0, height),
                ha="center",
                va="center",
                xytext=(0, 9),
                textcoords="offset points",
            )

    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()


# Example usage: build the interaction-type figure first, then display it
# (showing before the call would run while no figure from this cell exists).
plot_data(filter_ratio_data, include_performance_data=False)
plt.show()

# Appendix: Scenario distributions (domain, intent, realism level)

In [None]:
# draw a pie chart for the distribution of the scenarios domain of the scenarios; use seaborn
from haicosystem.protocols import HaiEnvironmentProfile

from collections import Counter

# Collect the domain, the first intent label and the realism value of every
# stored scenario; `intent` and `realism_level` are used by the cells below.
enviroment_profiles = HaiEnvironmentProfile.find().all()
domains = []
intent = []
realism_level = []

for env in enviroment_profiles:
    assert isinstance(env, HaiEnvironmentProfile)
    domains.append(env.domain)
    intent.append(env.agent_intent_labels[0])
    realism_level.append(env.realism)

# draw a pie chart for the distribution of the scenarios domain of the scenarios
import matplotlib.pyplot as plt
import seaborn as sns

# Count the occurrences of each domain in a single pass. Counter keeps the
# first-seen order, so slice order and colours are the same on every run
# (iterating a set of strings gives a hash-dependent order).
domain_counts = dict(Counter(domains))

# Create a pie chart
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
domain_counts_series = pd.Series(domain_counts)
colors = sns.color_palette("pastel6")
domain_counts_series.plot(
    kind="pie",
    autopct="%1.1f%%",
    startangle=140,
    textprops={"size": "larger"},
    colors=colors,
)
plt.title("Distribution of Scenario Domains")
plt.ylabel("")  # Hide the y-label

# Save the pie chart to a file
# NOTE(review): absolute, machine-specific output path.
plt.savefig(
    "/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/scenario_domain_pie_chart.pdf",
    format="pdf",
    bbox_inches="tight",
)
plt.show()

In [None]:
# bar plot for intent
from collections import Counter

# Single-pass count; Counter keeps first-seen order, so the bar order is the
# same on every run.
intent_counts = dict(Counter(intent))
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
intent_counts_series = pd.Series(intent_counts)
# `colors` is reused by the realism-level cell that follows.
colors = sns.color_palette("pastel6")
intent_counts_series.plot(kind="bar", color=colors)
plt.title("Distribution of Scenario Intent")
plt.ylabel("Count")
plt.xlabel("Intent")
plt.xticks(rotation=0)  # Ensure x ticks are flat
# NOTE(review): absolute, machine-specific output path.
plt.savefig(
    "/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/scenario_intent_bar_plot.pdf",
    format="pdf",
    bbox_inches="tight",
)
plt.show()
# The realism-level bar plot is produced by the next cell. The unfinished copy
# that used to follow here was dropped: it counted with
# `realism.count(realism)` (a string counting itself, i.e. always 1) and
# opened a figure that nothing was ever drawn on.

In [None]:
# Bar plot of how many scenarios fall into each realism level.
# Only values shorter than 20 characters are kept; `realism_level` is rebound
# to the filtered list.
# NOTE(review): presumably this drops entries whose realism field holds a long
# string such as a URL (the rebuttal cell further down maps values containing
# "https" to "level 3" instead) — confirm which treatment is intended.
realism_level = [i for i in realism_level if len(i) < 20]
# NOTE(review): iterating a set gives no guaranteed bar order between runs.
realism_counts = {i: realism_level.count(i) for i in set(realism_level)}
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
realism_counts_series = pd.Series(realism_counts)
# `colors` is the palette created in the intent cell above.
realism_counts_series.plot(kind="bar", color=colors)
plt.title("Distribution of Scenario Realism Level")
plt.ylabel("Count")
plt.xlabel("Realism Level")
plt.xticks(rotation=0)  # Ensure x ticks are flat
# NOTE(review): absolute, machine-specific output path.
plt.savefig(
    "/Users/xuhuizhou/Projects/papers/ICLR2025-HAICosystem/figures/scenario_realism_level_bar_plot.pdf",
    format="pdf",
    bbox_inches="tight",
)
plt.show()

### Rebuttal: Get breakdown stats for the realism level

In [None]:
# get performance and ratio data
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward

import pandas as pd

# Per-model tables of rewards broken down by realism level.
# NOTE(review): this cell rebinds `performance_data`, `ratio_data`,
# `model_rewards`, `all_episodes` and `realism_level`, which earlier cells use
# with different contents; `model_rewards` and `all_episodes` stay empty here.
performance_data = {}
ratio_data = {}
model_rewards = {}
all_episodes = []
realism_level = []  # realism label of every episode seen, across all models
realism_by_env = {}  # environment pk -> realism label, so each profile is fetched once

for model, tag in zip(models, tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
    print(f"model: {model}, tag: {tag}, episodes: {len(episodes)}")

    # Group this model's episodes by realism level. The label is resolved per
    # episode, so labels and episodes cannot get out of step. (Zipping the
    # list accumulated over all models against the current model's episodes
    # paired them up wrongly from the second model on.)
    grouped_episodes = {}
    for episode in episodes:
        env_pk = episode.environment
        if env_pk not in realism_by_env:
            env = HaiEnvironmentProfile.get(env_pk)
            # Realism values that contain a URL are counted as "level 3".
            realism_by_env[env_pk] = (
                env.realism if "https" not in env.realism else "level 3"
            )
        realism = realism_by_env[env_pk]
        realism_level.append(realism)
        grouped_episodes.setdefault(realism, []).append(episode)

    # One row per realism level; the frame is built once from the collected rows.
    rows = []
    for realism, realism_episodes in grouped_episodes.items():
        print(f"realism: {realism}, episodes: {len(realism_episodes)}")
        avg_reward = get_avg_reward(realism_episodes, model)  # type: ignore
        binary_avg_reward = get_avg_reward(realism_episodes, model, binary=True)  # type: ignore
        rows.append(
            {
                "Realism Level": realism,
                "Model": model,
                "Average Reward": avg_reward,
                "Binary Average Reward": binary_avg_reward,
            }
        )
    df = pd.DataFrame(
        rows,
        columns=["Realism Level", "Model", "Average Reward", "Binary Average Reward"],
    )

    # Independent copies, so later cells can add columns to them safely.
    performance_data[model] = df[["Realism Level", "Model", "Average Reward"]].copy()
    ratio_data[model] = df[["Realism Level", "Model", "Binary Average Reward"]].copy()

In [None]:
from copy import deepcopy

# Deep copy, so later cells can modify the frames without altering `performance_data`.
performance_data_copy = deepcopy(performance_data)

In [None]:
# Quick look at the "Average Reward" entry of the row labelled 0 for gpt-4-turbo.
print(performance_data_copy["gpt-4-turbo"]["Average Reward"][0])

In [None]:
import pandas as pd


def format_performance_data(performance_data, ratio_data=None):
    """Render one Markdown table of average risk scores per model.

    Args:
        performance_data: Maps a model name to a DataFrame with the columns
            ``"Realism Level"``, ``"Model"`` and ``"Average Reward"``. Every
            ``"Average Reward"`` cell must map each of the five risk
            dimensions listed below to a sequence whose first element is the
            score.
        ratio_data: Not used. It is optional so that the one-argument call in
            this notebook works; two-argument calls keep working as well.

    Returns:
        A dict mapping each model name to a Markdown table with the columns
        ``"Realism Level"`` and ``"Average Risk Score"``. Rows are ordered
        level 3, level 2, level 1; levels absent from a frame are left out.

    The input frames are not modified.
    """
    risk_dimensions = (
        "targeted_safety_risks",
        "system_and_operational_risks",
        "content_safety_risks",
        "societal_risks",
        "legal_and_rights_related_risks",
    )
    level_order = ["level 3", "level 2", "level 1"]

    def average_risk(reward):
        # Mean over all summed dimensions. NOTE(review): the previous formula
        # added these five scores but divided by 4 — confirm no reported
        # number relied on that.
        total = sum(float(reward[dimension][0]) for dimension in risk_dimensions)
        return f"{total / len(risk_dimensions):.2f}"

    formatted_data = {}
    for model, df in performance_data.items():
        # `assign` returns a new frame, leaving the caller's frame untouched.
        scored = df.assign(
            **{"Average Risk Score": df["Average Reward"].apply(average_risk)}
        )
        indexed = scored.set_index("Realism Level")
        # Select only levels that exist; `.loc` raises on missing labels.
        present_levels = [level for level in level_order if level in indexed.index]
        table = (
            indexed.loc[present_levels]
            .reset_index()
            .drop(columns=["Model", "Average Reward"])
        )
        formatted_data[model] = table.to_markdown(index=False)

    return formatted_data


# `format_performance_data` also declares a `ratio_data` parameter (which it
# does not read); it is passed here so the call matches that signature.
formatted_performance_data = format_performance_data(performance_data_copy, ratio_data)
for model, table in formatted_performance_data.items():
    print(f"## {model}\n")
    print(table)
    print("\n")

In [None]:
# Deep copy, so formatting can modify the frames without altering `ratio_data`.
# `deepcopy` is imported in an earlier cell.
ratio_data_copy = deepcopy(ratio_data)


def format_ratio_data(ratio_data):
    """Render one Markdown table of risk ratios per model.

    Args:
        ratio_data: Maps a model name to a DataFrame with the columns
            ``"Realism Level"``, ``"Model"`` and ``"Binary Average Reward"``.
            Every ``"Binary Average Reward"`` cell must provide
            ``["overall_score"][0]``, which is formatted with two decimals.

    Returns:
        A dict mapping each model name to a Markdown table with the columns
        ``"Realism Level"`` and ``"Risk Ratio"``. Rows are ordered level 3,
        level 2, level 1; levels absent from a frame are left out.

    The input frames are not modified.
    """
    level_order = ["level 3", "level 2", "level 1"]

    def risk_ratio(reward):
        return f"{float(reward['overall_score'][0]):.2f}"

    formatted_data = {}
    for model, df in ratio_data.items():
        # `assign` returns a new frame, leaving the caller's frame untouched.
        scored = df.assign(
            **{"Risk Ratio": df["Binary Average Reward"].apply(risk_ratio)}
        )
        indexed = scored.set_index("Realism Level")
        # Select only levels that exist; `.loc` raises on missing labels.
        present_levels = [level for level in level_order if level in indexed.index]
        table = (
            indexed.loc[present_levels]
            .reset_index()
            .drop(columns=["Model", "Binary Average Reward"])
        )
        formatted_data[model] = table.to_markdown(index=False)
    return formatted_data


# Print one Markdown section per model with its risk-ratio table.
formatted_ratio_data = format_ratio_data(ratio_data_copy)
for model, table in formatted_ratio_data.items():
    print(f"## {model}\n")
    print(table)
    print("\n")