In [None]:
# Imports
from json import load
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Resolve the base directory that every relative data path below is anchored to.
# NOTE(review): os.path.abspath("") yields the kernel's current working directory,
# not the notebook file's location; the two only coincide when the kernel is
# started in the notebook's folder — confirm for your setup.
__dir__ = Path(os.path.abspath(""))
"""
The absolute working directory, used as the notebook's base directory
"""

# Location for generated figures; created up front (including parents) so that
# later savefig calls do not fail on a missing directory
OUTPUT_DIRECTORY = __dir__.joinpath("../data/notebooks/results-analysis")
OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

In [None]:
def format_latex_table(data: dict[str, dict[str, float | int]]) -> str:
    """
    Format a LaTeX table from the given data.
    """

    # Get the columns
    columns = list(next(iter(data.values())).keys())

    # Format the table header
    formatted_table = f"\\begin{{tabular}}{{|l|{
      "r|" * len(columns)
    }}}\n\\hline\n{' & '.join(["Defense"] + columns)}\\\\\n\\hline\n"

    # Format the table rows
    for row_name, row_data in data.items():
        formatted_table += f"{row_name} & " + " & ".join(
            f"{row_data[col]:.4f}" if isinstance(row_data[col], float) or isinstance(row_data[col], int) else str(row_data[col])
            for col in columns
        ) + "\\\\\n"

    # Format the table footer
    formatted_table += "\\hline\n\\end{tabular}"

    return formatted_table

In [None]:
def load_static_result(path: Path):
    """
    Read one static evaluation result and reduce it to the reported metrics.

    The JSON document at ``path`` must contain the keys ``auc``, ``report``
    (with ``accuracy`` and a ``weighted avg`` entry holding ``precision``,
    ``recall`` and ``f1-score``) and ``total_time``.

    Args:
        path: Location of the UTF-8 encoded ``results.json`` file.

    Returns:
        A dictionary with the keys "AUC", "Accuracy", "Precision", "Recall",
        "F1" and "Time" (in that order), holding the values read from the file.

    Raises:
        KeyError: If one of the expected keys is missing from the document.
    """

    with open(path, "r", encoding="utf-8") as handle:
        payload = load(handle)

    # Pull the values out in a fixed order; precision/recall/F1 come from the
    # "weighted avg" entry of the report
    auc = payload["auc"]
    report = payload["report"]
    accuracy = report["accuracy"]
    weighted_average = report["weighted avg"]

    summary = {"AUC": auc, "Accuracy": accuracy}
    summary["Precision"] = weighted_average["precision"]
    summary["Recall"] = weighted_average["recall"]
    summary["F1"] = weighted_average["f1-score"]
    summary["Time"] = payload["total_time"]

    return summary


DATASETS = {
    "ahsanayub/malicious-prompts": "ahsanayub/malicious-prompts",
    "jayavibhav/prompt-injection-safety": "jayavibhav/prompt-injection-safety",
    "synthetic-dataset": "synthetic-dataset",
}
"""
Mapping of dataset identifiers to their display names.
"""

EMBEDDING_MODELS = {
    "sentence-transformers/all-MiniLM-L6-v2": "all-MiniLM",
    "thenlper/gte-large": "gte-large",
}
"""
Mapping of embedding model identifiers to their display names.
"""

CLASSIFIER_MODELS = {
    "protectai/deberta-v3-base-prompt-injection-v2": "deberta-v2",
    "meta-llama/Prompt-Guard-86M": "Prompt-Guard",
    "meta-llama/Llama-Prompt-Guard-2-86M": "Prompt-Guard-2",
    "qualifire/prompt-injection-jailbreak-sentinel-v2": "sentinel-v2",
}
"""
Mapping of classifier model identifiers to their display names.
"""


def _static_result_path(classifier_kind: str, *identifiers: str) -> Path:
    """
    Build the path of a static ``results.json`` file.

    Args:
        classifier_kind: Directory name of the classifier family, e.g.
            "classifier-embedding".
        identifiers: Dataset/model identifiers, one per nested directory level.

    Returns:
        ``__dir__/../data/notebooks/<classifier_kind>/<identifiers...>/results.json``
        with every "/" inside an identifier replaced by "-".
    """

    # Identifiers such as "org/model" are stored on disk with "/" replaced by
    # "-". Doing the replacement here (instead of inside an f-string) also
    # avoids nested double quotes, which are a syntax error before Python 3.12.
    directories = [identifier.replace("/", "-") for identifier in identifiers]

    return __dir__.joinpath(
        f"../data/notebooks/{classifier_kind}", *directories, "results.json"
    )


# Load static results: one entry per dataset display name, each mapping a
# model display name to its metrics
static_results = {
    dataset_name: {
        # Embedding classifier
        **{
            embedding_model_name: load_static_result(
                _static_result_path(
                    "classifier-embedding", dataset_id, embedding_model_id
                )
            )
            for embedding_model_id, embedding_model_name in EMBEDDING_MODELS.items()
        },
        # Transformer classifier
        **{
            classifier_model_name: load_static_result(
                _static_result_path(
                    "classifier-transformer", dataset_id, classifier_model_id
                )
            )
            for classifier_model_id, classifier_model_name in CLASSIFIER_MODELS.items()
        },
        # Hybrid classifier (every embedding model paired with every classifier model)
        **{
            f"{embedding_model_name}-{classifier_model_name}": load_static_result(
                _static_result_path(
                    "classifier-hybrid",
                    dataset_id,
                    embedding_model_id,
                    classifier_model_id,
                )
            )
            for embedding_model_id, embedding_model_name in EMBEDDING_MODELS.items()
            for classifier_model_id, classifier_model_name in CLASSIFIER_MODELS.items()
        },
    }
    for dataset_id, dataset_name in DATASETS.items()
}

In [None]:
# Create the static results chart
METRICS = ["AUC", "Accuracy", "Precision", "Recall", "F1"]
"""
Metrics to be displayed in the chart.
"""

# Width of a single bar; the bars of one model are drawn side by side
bar_width = 0.1
# Offset from a group's first bar to the middle of the group, so tick labels
# and time markers stay centred even if METRICS changes
group_center = (len(METRICS) - 1) * bar_width / 2

# Graph the results: one subplot per dataset. squeeze=False keeps `axs` an
# array even when there is only a single dataset.
fig, axs = plt.subplots(1, len(static_results), figsize=(20, 6), squeeze=False)

for ax, (dataset, results) in zip(axs.flat, static_results.items()):
    model_names = list(results.keys())
    x = range(len(model_names))
    centers = [p + group_center for p in x]

    # Bars for each metric, shifted by one bar width per metric
    for metric_index, metric in enumerate(METRICS):
        y = [results[model][metric] for model in model_names]
        ax.bar(
            [p + metric_index * bar_width for p in x],
            y,
            width=bar_width,
            label=metric,
        )

    ax.set_xticks(centers)
    ax.set_xticklabels(model_names, rotation=45, ha="right")
    ax.set_title(f"Performance on {dataset}")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1)

    # Secondary axis for time, since it is not on the 0-1 score scale
    ax2 = ax.twinx()
    time_values = [results[model]["Time"] for model in model_names]
    time_line, = ax2.plot(centers, time_values, color="black", marker="o", label="Time")
    ax2.set_ylabel("Time (s)", color="black")
    ax2.tick_params(axis="y", labelcolor="black")

    # Combine legends: the time line lives on the twin axis, so it is appended
    # to the bar handles of the primary axis by hand
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles + [time_line], labels + ["Time"], loc="lower right", facecolor="white")

plt.tight_layout()
plt.savefig(OUTPUT_DIRECTORY / "static-results.png", dpi=600)

In [None]:
# Emit one LaTeX table per dataset, each preceded by a heading line
for dataset, results in static_results.items():
    print(f"LaTeX table for {dataset}:", format_latex_table(results), sep="\n")

In [None]:
def load_dynamic_result(path: Path):
    """
    Load dynamic results from a JSON file.
    """

    with open(path, "r", encoding="utf-8") as file:
        data = load(file)

    def calculate_fractional_metric(value: dict[str, int | float]) -> float:
        """
        Calculate a fractional metric from a dictionary containing 'numerator' and 'denominator'.
        """

        return value["numerator"] / value["denominator"] if value["denominator"] != 0 else 0.0

    return {
        "Injection Task Utility": calculate_fractional_metric(data["statistics"]["injection_task_utility"]),
        "Utility Under Attack": calculate_fractional_metric(data["statistics"]["utility_under_attack"]),
        "Attack Success Rate": calculate_fractional_metric(data["statistics"]["targeted_attack_success_rate"]),
        "Benign Utility": calculate_fractional_metric(data["statistics"]["benign_utility"]),
        "Mean Overall Overhead": np.mean(data["statistics"]["overall_overheads"]),
        "Mean Benign Overhead": np.mean(data["statistics"]["benign_overheads"]),
        "P50 Overall Overhead": np.percentile(data["statistics"]["overall_overheads"], 50),
        "P50 Benign Overhead": np.percentile(data["statistics"]["benign_overheads"], 50),
        "P95 Overall Overhead": np.percentile(data["statistics"]["overall_overheads"], 95),
        "P95 Benign Overhead": np.percentile(data["statistics"]["benign_overheads"], 95),
        "P99 Overall Overhead": np.percentile(data["statistics"]["overall_overheads"], 99),
        "P99 Benign Overhead": np.percentile(data["statistics"]["benign_overheads"], 99),
    }

# Load the dynamic results data: one results file per defense, loaded in the
# listed order (the order also determines bar and table row order downstream)
dynamic_results = {
    defense: load_dynamic_result(__dir__ / relative_path)
    for defense, relative_path in [
        ("none", "../data/agentdojo/none/results.json"),
        ("data-delimiter", "../data/agentdojo/data-delimiter/results.json"),
        ("embedding", "../data/agentdojo/embedding/results.json"),
        (
            "transformer-deberta-v2",
            "../data/agentdojo/transformer/protectai-deberta-v3-base-prompt-injection-v2/results.json",
        ),
        (
            "hybrid-deberta-v2",
            "../data/agentdojo/hybrid/protectai-deberta-v3-base-prompt-injection-v2/results.json",
        ),
        (
            "transformer-sentinel-v2",
            "../data/agentdojo/transformer/qualifire-prompt-injection-jailbreak-sentinel-v2/results.json",
        ),
        (
            "hybrid-sentinel-v2",
            "../data/agentdojo/hybrid/qualifire-prompt-injection-jailbreak-sentinel-v2/results.json",
        ),
        ("llm", "../data/agentdojo/llm/results.json"),
        ("multiclass", "../data/agentdojo/multiclass/results.json"),
    ]
}

In [None]:
# Create the dynamic results charts
PERFORMANCE_METRICS = [
    "Injection Task Utility",
    "Utility Under Attack",
    "Attack Success Rate",
    "Benign Utility",
]
"""
Metrics related to task performance.
"""

LATENCY_METRICS = [
    "Mean Overall Overhead",
    "Mean Benign Overhead",
    "P50 Overall Overhead",
    "P50 Benign Overhead",
    "P95 Overall Overhead",
    "P95 Benign Overhead",
    "P99 Overall Overhead",
    "P99 Benign Overhead",
]
"""
Metrics related to latency overhead.
"""

LATENCY_SKIP_DATASETS = [
    "none",
    "data-delimiter",
]
"""
Datasets to skip for latency metrics (i.e., that don't have any overhead/latency).
"""

# Width of a single bar; the bars of one defense are drawn side by side
bar_width = 0.1

# Defenses shown in each panel. The latency panel derives its list by
# filtering, so bar positions, bar data and tick labels always have the same
# length, even if a skip entry is not present in the results.
defenses = list(dynamic_results.keys())
latency_defenses = [
    defense for defense in defenses if defense not in LATENCY_SKIP_DATASETS
]

# Graph the dynamic results
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
performance_ax, latency_ax = axs

# Graph the performance metrics
x = range(len(defenses))
for metric_index, metric in enumerate(PERFORMANCE_METRICS):
    y = [dynamic_results[defense][metric] for defense in defenses]
    performance_ax.bar(
        [p + metric_index * bar_width for p in x],
        y,
        width=bar_width,
        label=metric,
    )

performance_ax.set_title("Task Performance")
# Centre each tick under its group of bars
performance_ax.set_xticks(
    [p + (len(PERFORMANCE_METRICS) - 1) * bar_width / 2 for p in x]
)
performance_ax.set_xticklabels(defenses, rotation=45, ha="right")
performance_ax.set_ylim(0)
performance_ax.set_ylabel("Score")
performance_ax.legend(loc="upper center", facecolor="white")

# Graph the latency metrics
x = range(len(latency_defenses))
for metric_index, metric in enumerate(LATENCY_METRICS):
    y = [dynamic_results[defense][metric] for defense in latency_defenses]
    latency_ax.bar(
        [p + metric_index * bar_width for p in x],
        y,
        width=bar_width,
        label=metric,
    )

latency_ax.set_title("Latency Overhead")
latency_ax.set_xticks([p + (len(LATENCY_METRICS) - 1) * bar_width / 2 for p in x])
latency_ax.set_xticklabels(latency_defenses, rotation=45, ha="right")
latency_ax.set_yscale("log")
# A log axis cannot start at 0: matplotlib ignores a non-positive lower limit
# (with a warning), so only the upper limit is set here.
# NOTE(review): the unit of the overhead values is not established in this
# notebook; confirm that the "Milliseconds" label and the upper limit of 5 match.
latency_ax.set_ylim(top=5)
latency_ax.set_ylabel("Defense Overhead (Milliseconds)")
latency_ax.legend(loc="upper center", facecolor="white")

plt.tight_layout()
plt.savefig(OUTPUT_DIRECTORY / "dynamic-results.png", dpi=600)

In [None]:
# Emit the LaTeX table of the dynamic results, preceded by a heading line
print("Dynamic results:", format_latex_table(dynamic_results), sep="\n")