In [None]:
# Imports
from json import dump
from json import load as json_load
import os
from pathlib import Path
from time import perf_counter
from time import time

from dotenv import load_dotenv
from imblearn.under_sampling import RandomUnderSampler
import joblib
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np
import polars as pl
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Load environment variables from the local .env file (if any)
load_dotenv()

# Resolve the directory the notebook is running from
__dir__ = Path(os.path.abspath(""))
"""
The working directory of the notebook kernel (a notebook has no `__file__`,
so an empty relative path is resolved instead)
"""

# Read the embedding model name; raises KeyError when the variable is unset
EMBEDDING_MODEL_NAME = os.environ["EMBEDDING_MODEL_NAME"]
"""
Embedding model name
"""

# Create the output directory, one folder per embedding model.
# Slashes in the model name are replaced so it stays a single path component.
# The replacement is computed outside the f-string because reusing the
# enclosing quote character inside a replacement field is a syntax error
# before Python 3.12.
output_directory_name = EMBEDDING_MODEL_NAME.replace("/", "-")
OUTPUT_DIRECTORY = __dir__ / f"../data/notebooks/classifier-multiclass/{output_directory_name}"
OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

In [None]:
# Instantiate the Hugging Face embedding model selected through the environment
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Encoding mappings
goal2id = {
    "benign": 0,
    "action manipulation": 1,
    "denial of service": 2,
    "data exfiltration": 3,
    "prompt exfiltration": 4,
    "jailbreaking": 5,
}
"""
Mapping of goal names to IDs

Note: goals are mutually exclusive in this dataset.
"""


def map_goal2id(goal_name: str) -> int:
    """
    Translate a goal name into its integer ID using `goal2id`.

    Raises:
        ValueError: if `goal_name` is not a key of `goal2id`.
    """

    # Known goal: return its ID straight away
    if goal_name in goal2id:
        return goal2id[goal_name]

    raise ValueError(f"Unknown goal name: {goal_name}")

In [None]:
# Read the aggregated synthetic dataset from disk
aggregated_dataset_path = __dir__ / "../data/synthetic-dataset/aggregated.json"
with aggregated_dataset_path.open("r", encoding="utf-8") as aggregated_file:
    aggregated_synthetic_dataset = json_load(aggregated_file)

In [None]:
# Build the labelled samples: one (embedding, goal ID) pair per message
data = []

# Every benign message shares the same label, so resolve it once up front
benign_goal_id = map_goal2id("benign")

for idea in aggregated_synthetic_dataset["ideas"]:
    # Benign conversations: every message becomes a benign sample
    for conversation in idea["benign_conversations"]:
        benign_texts = [message["content"] for message in conversation["messages"]]
        data.extend(
            (embedding, benign_goal_id)
            for embedding in embedding_model.embed_documents(texts=benign_texts)
        )

    # Malicious conversations: only messages flagged as injections are kept,
    # labelled with the goal they are listed under
    for goal_entry in idea["malicious_goals"]:
        goal_id = map_goal2id(goal_entry["goal"])

        for attack in goal_entry["malicious_conversations"]:
            injected_texts = [
                message["content"]
                for message in attack["conversation"]["messages"]
                if message.get("injection")
            ]
            data.extend(
                (embedding, goal_id)
                for embedding in embedding_model.embed_documents(texts=injected_texts)
            )

In [None]:
# Split the data into train and test sets.
# The split is stratified on the goal ID so each class keeps the same share in
# both splits. The classes are imbalanced (the train split is under-sampled
# afterwards), and an unstratified split could leave a rare class out of the
# test split, which would make the multi-class AUC computed later fail.
goal_ids = [goal_id for _, goal_id in data]
train_split, test_split = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=goal_ids,
)

In [None]:
# Balance the classes of the train split by random under-sampling
unbalanced_embeddings = [embedding for embedding, _ in train_split]
unbalanced_goal_ids = [goal_id for _, goal_id in train_split]

undersampler = RandomUnderSampler(random_state=0)
resampled_embeddings, resampled_goal_ids = undersampler.fit_resample(  # type: ignore
    unbalanced_embeddings,
    unbalanced_goal_ids,
)

# Re-pair each resampled embedding with its goal ID
balanced_train_split = list(zip(resampled_embeddings, resampled_goal_ids))

In [None]:
# Build one dataframe per split, with one row per (embedding, goal ID) pair
split_schema = ["embeddings", "goal_id"]
train_df = pl.DataFrame(balanced_train_split, schema=split_schema, orient="row")
test_df = pl.DataFrame(test_split, schema=split_schema, orient="row")

In [None]:
# Fit a random forest on the balanced training embeddings
goal_classifier = RandomForestClassifier(
    n_estimators=200,
    criterion="gini",
    random_state=0,
    verbose=2,
)
train_features = train_df["embeddings"].to_list()
train_targets = train_df["goal_id"].to_list()
goal_classifier.fit(train_features, train_targets)

In [None]:
# Persist the trained classifier next to the other outputs of this run
classifier_path = OUTPUT_DIRECTORY / "random-forest-goal.joblib"
joblib.dump(goal_classifier, classifier_path)

In [None]:
# Benchmark the classifier
# Materialise the test embeddings first so the timed region covers only the
# classifier's prediction, not the dataframe-to-list conversion
test_embeddings = test_df["embeddings"].to_list()

# perf_counter is used instead of time() because it is monotonic and
# high-resolution, so system clock adjustments cannot skew the measurement
goal_start = perf_counter()
raw_goal_y_probabilities = goal_classifier.predict_proba(test_embeddings)
goal_end = perf_counter()

# Log the classification time (in seconds)
elapsed_goal_classify_time = goal_end - goal_start

In [None]:
# Display the raw class probabilities predicted for the test split
raw_goal_y_probabilities

In [None]:
# Convert multi-class Y probabilities to multi-class Y predictions.
# The columns of predict_proba follow the order of `goal_classifier.classes_`,
# so the index of the most probable column is mapped back through `classes_`
# instead of being used as the goal ID directly. The two only coincide when
# the classifier was trained on every goal ID 0..N-1.
most_probable_columns = np.argmax(raw_goal_y_probabilities, axis=1)
goal_y_predictions = list(goal_classifier.classes_[most_probable_columns])

In [None]:
# Collect the ground-truth goal IDs of the test split
goal_id_column = test_df["goal_id"]
goal_y_actual = goal_id_column.to_list()

In [None]:
# Compute the metrics before opening the output file, so a failing metric
# cannot leave an empty results.json behind
goal_results = {
    "auc": roc_auc_score(
        goal_y_actual, raw_goal_y_probabilities, multi_class="ovr"
    ),
    "report": classification_report(
        goal_y_actual,
        goal_y_predictions,
        # Pin the label order explicitly so each target name is paired with
        # its own goal ID, even if a goal is absent from the test data
        labels=list(goal2id.values()),
        target_names=list(goal2id.keys()),
        output_dict=True,
    ),
    "classify_time": elapsed_goal_classify_time,
    # NOTE(review): same value as classify_time; confirm whether total_time
    # is meant to include other stages (e.g. embedding)
    "total_time": elapsed_goal_classify_time,
}

# Save the results
with open(OUTPUT_DIRECTORY / "results.json", "w", encoding="utf-8") as results_file:
    dump(goal_results, results_file, indent=2)