# Data

In [None]:
# Identifiers of the agent models under evaluation. They are passed to
# get_avg_reward and used as keys of models_mapping in later cells.
models = [
    "gpt-4-turbo",
    "gpt-3.5-turbo",
    "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
    "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
]
# Episode tags used to query EpisodeLog. Later cells pair the two lists with
# zip(models, tags), so tags[i] must be the tag that belongs to models[i].
tags = [
    "benchmark_gpt-4-turbo_gpt-4o_gpt-4o_haicosystem_trial2",
    "benchmark_gpt-3.5-turbo_gpt-4o_gpt-4o_haicosystem_trial2",
    "benchmark_together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2",
    "benchmark_together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2",
]

### Models mapping

In [None]:
# Maps each model identifier to the short display name used in the LaTeX
# table header and in the plot legends / axis labels. The plotting and table
# functions index this dict directly, so every evaluated model needs an entry.
models_mapping = {
    "gpt-4-turbo": "GPT-4-turbo",
    "gpt-3.5-turbo": "GPT-3.5-turbo",
    "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": "Llama3-70B",
    "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": "Llama3-405B",
}

# Sec 6.1: main results

In [None]:
# get performance and ratio data
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward

# Fetch each model's episodes once and aggregate them twice with
# get_avg_reward: the default aggregation feeds the results table and the
# efficiency/goal plot, the binary=True aggregation feeds the risk-ratio chart.
performance_data = {}
ratio_data = {}
model_rewards = {}
for model, tag in zip(models, tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
    performance_data[model] = get_avg_reward(episodes, model)  # type: ignore
    ratio_data[model] = get_avg_reward(episodes, model, binary=True)  # type: ignore

In [None]:
def generate_latex_table(performance, models, wrap=True):
    """Render per-model scores as the LaTeX source of a tabularx table.

    Args:
        performance: Mapping of model identifier -> mapping of dimension key
            -> indexable value; element 0 of that value is printed with two
            decimals. A missing model or dimension yields an empty cell.
        models: Model identifiers, one table column each, in column order.
            Every entry must be a key of the notebook-level ``models_mapping``.
        wrap: When true the table is emitted inside a ``wraptable``
            environment, otherwise inside a plain ``table`` environment.

    Returns:
        The LaTeX source as a single string.
    """
    # Row key -> "label & range" text that opens the corresponding table row.
    row_labels = {
        "targeted_safety_risks": "Targeted Safety Risks & [-10, 0]",
        "system_and_operational_risks": "System and Operational Risks & [-10, 0]",
        "content_safety_risks": "Content Safety Risks & [-10, 0]",
        "societal_risks": "Societal Risks & [-10, 0]",
        "legal_and_rights_related_risks": "Legal and Rights-Related Risks & [-10, 0]",
        "efficiency": "Efficiency & [0, 10]",
        "goal": "Goal Completion & [0, 10]",
    }

    if wrap:
        environment = "wraptable"
        opening = (
            "\n\\begin{wraptable}[13]{r}{8.7cm}\n"
            "\\small\n"
            "\\vspace{-10pt}\n"
            "\\centering\n"
        )
    else:
        environment = "table"
        opening = "\n\\begin{table}[h]\n\\small\n\\centering\n"

    # Two leading columns (dimension, range) plus one column per model.
    column_spec = "r" * (len(models) + 2)
    header_cells = "".join(f"& {models_mapping[name]} " for name in models)

    pieces = [
        opening,
        "    \\begin{tabularx}{8.7cm}{@{\\hspace{10pt}}"
        + column_spec
        + "@{\\hspace{6pt}}}\n",
        "    \\toprule\n",
        "         Dimension & Range " + header_cells + "\\\\ \\midrule\n",
    ]

    # One row per dimension; cells without data stay blank.
    for key, label in row_labels.items():
        cells = []
        for name in models:
            if name in performance and key in performance[name]:
                cells.append(f"& {performance[name][key][0]:.2f} ")
            else:
                cells.append("& ")
        pieces.append(f"         {label} " + "".join(cells) + "\\\\ \n")

    pieces.append(
        "    \\bottomrule\n"
        "    \\end{tabularx}\n"
        "    \\vspace{-5pt}\n"
        "    \\caption{The aggregated performance of each model by averaging across different episodes.}\n"
        "    \\label{tab:model_safety_evaluation}\n"
    )
    pieces.append("\\end{" + environment + "}\n")
    return "".join(pieces)

In [None]:
# Build the plain (non-wraptable) variant of the main-results table and print
# its LaTeX source so it can be copied out of the notebook.
latex_code = generate_latex_table(performance_data, models, wrap=False)
print(latex_code)

### Risk ratio graph

In [None]:
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import seaborn as sns


def generate_bar_graph(
    ratio_data: Dict[str, Dict[str, Tuple[float, float]]], save_path: str = ""
) -> None:
    """Draw a grouped bar chart with one group per risk category and one bar per model.

    Args:
        ratio_data: Maps a model identifier (must be a key of the
            notebook-level ``models_mapping``) to a dict of category name ->
            tuple. Only the first tuple element is plotted; a category that is
            missing for a model is drawn as 0.0.
        save_path: When non-empty, the finished figure is also written to this
            path as a PDF.
    """
    # Categories shown on the x-axis, in plotting order.
    categories = [
        "overall_score",
        "targeted_safety_risks",
        "system_and_operational_risks",
        "content_safety_risks",
        "societal_risks",
        "legal_and_rights_related_risks",
    ]
    # Short tick labels for the categories above.
    category_acronyms = {
        "overall_score": "Overall",
        "targeted_safety_risks": "Tar.R",
        "system_and_operational_risks": "Sys.R",
        "content_safety_risks": "Con.R",
        "societal_risks": "Soc.R",
        "legal_and_rights_related_risks": "Legal.R",
    }

    models = list(ratio_data.keys())
    # category -> list of values, aligned with the order of `models`.
    category_values = {
        category: [ratio_data[model].get(category, (0.0, 0.0))[0] for model in models]
        for category in categories
    }

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(12, 5))
    palette = sns.color_palette("Blues", len(models))

    # Category groups sit one x-unit apart. Keep the original 0.2 bar width
    # for up to four models, but shrink it beyond that so a group never spans
    # more than 0.8 units and cannot run into its neighbour. max(..., 1)
    # avoids a division by zero when ratio_data is empty.
    bar_width = min(0.2, 0.8 / max(len(models), 1))
    index = range(len(categories))  # x location of the first bar of each group

    for i, model in enumerate(models):
        bars = ax.bar(
            [p + i * bar_width for p in index],
            [category_values[cat][i] for cat in categories],
            bar_width,
            label=models_mapping[model],
            color=palette[i],
        )
        # Print each bar's value just above it.
        for bar in bars:
            yval = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                yval + 0.01,
                f"{yval:.2f}",
                ha="center",
                va="bottom",
                fontsize=10,
            )

    # Labels and legend; ticks are centred under each group of bars.
    ax.set_ylabel("Risk Ratio", fontsize=12, fontweight="bold")
    ax.set_xticks([p + (bar_width * (len(models) - 1) / 2) for p in index])
    ax.set_xticklabels([category_acronyms[cat] for cat in categories], fontsize=12)
    ax.legend(
        title="Models",
        title_fontsize="13",
        fontsize="11",
        loc="best",
        frameon=True,
        fancybox=True,
        shadow=True,
    )
    sns.despine()

    # Finalize the layout BEFORE saving so the PDF matches the displayed figure.
    fig.tight_layout()
    if save_path:
        fig.savefig(save_path, format="pdf", bbox_inches="tight")
    plt.show()

In [None]:
# Show the per-category risk-ratio chart and export it as a PDF next to the notebook.
generate_bar_graph(ratio_data, save_path="./risk_ratio_graph.pdf")

### Comparison across models

In [None]:
from collections import defaultdict

from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile


def _group_episodes_by_scenario(tag: str) -> dict:
    """Fetch the episodes stored under `tag`, grouped by (environment, agent 0, agent 1)."""
    grouped = defaultdict(list)
    for episode in EpisodeLog.find(EpisodeLog.tag == tag).all():
        # A tuple key keeps the three ids separate; no delimiter can collide
        # with characters inside an id.
        scenario = (episode.environment, episode.agents[0], episode.agents[1])
        grouped[scenario].append(episode)
    return grouped


def compare_across_models(tags: list[str]) -> list[list]:
    """Compare the targeted-safety-risk score of two runs, scenario by scenario.

    A scenario is the combination of environment and the two agents. Only
    scenarios for which EACH tag has exactly one episode are compared.
    Episodes are grouped per tag, so two episodes from the same tag are never
    mistaken for a cross-model pair.

    Args:
        tags: Exactly two episode tags; the first is reported as the first
            score, the second as the second score.

    Returns:
        One ``[codename, score_for_tags[0], score_for_tags[1]]`` row per
        comparable scenario, in the order the scenarios first appear under
        ``tags[0]``. Scores are read from
        ``episode.rewards[1][1]["targeted_safety_risks"]``.

    Raises:
        AssertionError: If `tags` does not contain exactly two entries.
    """
    assert len(tags) == 2, "The function only supports two tags for comparison."

    first_by_scenario = _group_episodes_by_scenario(tags[0])
    second_by_scenario = _group_episodes_by_scenario(tags[1])

    comparison_results = []
    for scenario, first_episodes in first_by_scenario.items():
        second_episodes = second_by_scenario.get(scenario, [])
        # Skip scenarios that are missing or duplicated on either side; there
        # is no unambiguous one-to-one pairing for them.
        if len(first_episodes) != 1 or len(second_episodes) != 1:
            continue
        # Look up the environment only for scenarios that are actually reported.
        codename = HaiEnvironmentProfile.get(scenario[0]).codename
        performance1 = first_episodes[0].rewards[1][1]["targeted_safety_risks"]
        performance2 = second_episodes[0].rewards[1][1]["targeted_safety_risks"]
        comparison_results.append([codename, performance1, performance2])
    return comparison_results

In [None]:
import rich

# Scenario-by-scenario comparison of the two Llama 3.1 runs: the 405B tag is
# passed first, the 70B tag second, so they are scores 1 and 2 in each row.
comparison = compare_across_models(
    [
        "benchmark_together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2",
        "benchmark_together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo_gpt-4o-2024-08-06_gpt-4o-2024-08-06_haicosystem_trial2",
    ]
)
rich.print(comparison)

# Sec 6.2: human intents

In [None]:
# Obtain the data
from sotopia.database import EpisodeLog

from haicosystem.protocols import HaiEnvironmentProfile
from haicosystem.utils import get_avg_reward

# For every model, split its episodes by the first agent's intent label of the
# episode's environment ("benign" vs. anything else) and aggregate each split
# with get_avg_reward(binary=True). Result: model -> (benign, malicious).
model_rewards = {}
for model, tag in zip(models, tags):
    episodes = EpisodeLog.find(EpisodeLog.tag == tag).all()
    benign_intent_episodes = []
    malicious_intent_episodes = []
    for episode in episodes:
        first_label = HaiEnvironmentProfile.get(
            episode.environment
        ).agent_intent_labels[0]
        bucket = (
            benign_intent_episodes
            if first_label == "benign"
            else malicious_intent_episodes
        )
        bucket.append(episode)
    model_rewards[model] = (
        get_avg_reward(benign_intent_episodes, model, binary=True),  # type: ignore
        get_avg_reward(malicious_intent_episodes, model, binary=True),  # type: ignore
    )

In [None]:
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import seaborn as sns


def draw_overall_score_bar_plot(
    data: Dict[
        str, Tuple[Dict[str, Tuple[float, float]], Dict[str, Tuple[float, float]]]
    ],
    save_path: str,
) -> None:
    """Plot each model's overall score for benign vs. malicious intent as grouped bars.

    Args:
        data: Maps a model identifier (must be a key of the notebook-level
            ``models_mapping``) to a ``(benign, malicious)`` pair of dicts.
            From each dict only ``["overall_score"][0]`` is plotted.
        save_path: When non-empty, the finished figure (including the
            customized legend and layout) is also written there as a PDF.
    """
    # Flatten the input into three parallel columns, two entries per model.
    models = []
    scores = []
    intents = []

    for model_name, (benign, malicious) in data.items():
        models.append(models_mapping[model_name])
        scores.append(benign["overall_score"][0])
        intents.append("Benign")

        models.append(models_mapping[model_name])
        scores.append(malicious["overall_score"][0])
        intents.append("Malicious")

    # Column-oriented dict; seaborn accepts it directly as `data`.
    plot_data = {"Model": models, "Overall Score": scores, "Intent": intents}
    # Fixed color per intent.
    palette = {"Benign": "#A2CA71", "Malicious": "#F6E96B"}

    plt.figure(figsize=(6, 4))
    ax = sns.barplot(
        x="Model", y="Overall Score", hue="Intent", data=plot_data, palette=palette
    )

    plt.xlabel("")
    plt.ylabel("Overall Risk Ratio")
    plt.ylim(0, 1)

    # Write the value above each bar; patches without a positive height are skipped.
    for p in ax.patches:
        height = p.get_height()
        if height > 0:
            ax.annotate(
                f"{height:.2f}",
                (p.get_x() + p.get_width() / 2.0, height),
                ha="center",
                va="bottom",
                xytext=(0, 8),
                textcoords="offset points",
                fontsize=10,
                color="black",
            )

    plt.legend(title="Intent", title_fontsize="10", fontsize="8")
    plt.tight_layout()

    # Save only after the legend and layout are final, so the exported PDF
    # matches the figure that is displayed.
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")

    plt.show()


# Plot benign vs. malicious overall scores per model and export the figure as a PDF.
draw_overall_score_bar_plot(data=model_rewards, save_path="./human_intent_plot.pdf")

# Sec 6.3: Access to the tools


In [None]:
from typing import Dict

import matplotlib.pyplot as plt
import seaborn as sns


def draw_efficiency_goal_bar_plot(
    data: dict[str, dict[str, tuple[float, float]]], save_path: "str | None" = None
) -> None:
    """Plot each model's efficiency and goal score as grouped bars.

    Args:
        data: Maps a model identifier (must be a key of the notebook-level
            ``models_mapping``) to a metrics dict. Only ``["efficiency"][0]``
            and ``["goal"][0]`` are plotted.
        save_path: When set to a non-empty path, the finished figure
            (including the customized legend and layout) is also written
            there as a PDF.
    """
    # Flatten the input into three parallel columns, two entries per model.
    models = []
    scores = []
    metrics = []

    for model_name, metrics_dict in data.items():
        display_name = models_mapping[model_name]
        models.append(display_name)
        scores.append(metrics_dict["efficiency"][0])
        metrics.append("Efficiency")

        models.append(display_name)
        scores.append(metrics_dict["goal"][0])
        metrics.append("Goal")

    # Column-oriented dict; seaborn accepts it directly as `data`.
    plot_data = {"Model": models, "Score": scores, "Metric": metrics}

    # Fixed color per metric.
    palette = {"Efficiency": "#69db7c", "Goal": "#4dabf7"}

    plt.figure(figsize=(6, 4))
    ax = sns.barplot(
        x="Model", y="Score", hue="Metric", data=plot_data, palette=palette
    )

    plt.xlabel("")
    plt.ylabel("Score")
    # Zoom the y-axis to the observed range; min()/max() would raise on empty data.
    if scores:
        plt.ylim(min(scores) - 1, max(scores) + 1)

    # Label the bars through the axes' bar containers instead of ax.patches,
    # so only patches that belong to a bar group are annotated.
    # NOTE(review): ax.patches may also hold non-bar patches (the sibling
    # intent plot filters them with `height > 0`) - confirm with the installed
    # seaborn version.
    for container in ax.containers:
        for bar in container:
            height = bar.get_height()
            ax.annotate(
                f"{height:.2f}",
                (bar.get_x() + bar.get_width() / 2.0, height),
                ha="center",
                va="bottom",
                xytext=(0, 8),
                textcoords="offset points",
                fontsize=10,
                color="black",
            )

    plt.legend(title="Metric", title_fontsize="10", fontsize="8")
    plt.tight_layout()

    # Save only after the legend and layout are final, so the exported PDF
    # matches the figure that is displayed.
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight")

    plt.show()


# Plot efficiency and goal scores per model and export the figure as a PDF.
draw_efficiency_goal_bar_plot(
    data=performance_data, save_path="./efficiency_goal_plot.pdf"
)