In [None]:
# Imports
from dotenv import load_dotenv
from glob import glob
from itertools import product
from json import dump as json_dump, load as json_load
from langchain_huggingface import HuggingFaceEmbeddings
from matplotlib import pyplot as plt
from pathlib import Path
from sklearn.metrics import classification_report, roc_auc_score
from time import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import joblib
import os
import polars as pl
import torch

In [None]:
# Load environment variables (via python-dotenv) before reading the settings below
load_dotenv()

# Base directory for every relative data path used in this notebook.
# os.path.abspath("") resolves against the current working directory.
__dir__ = Path(os.path.abspath(""))

# Required settings; a missing variable raises KeyError right here
DATASET_NAME = os.environ["DATASET_NAME"]  # Dataset name
EMBEDDING_MODEL_NAME = os.environ["EMBEDDING_MODEL_NAME"]  # Embedding model name
CLASSIFIER_MODEL_NAME = os.environ["CLASSIFIER_MODEL_NAME"]  # Classifier model name

# Output directory: one nested folder per dataset / embedding model / classifier model,
# with "/" in each name replaced by "-" so a name maps to a single path segment
OUTPUT_DIRECTORY = __dir__ / "/".join([
    "../data/notebooks/parameter-search-hybrid",
    *(
        setting.replace("/", "-")
        for setting in (DATASET_NAME, EMBEDDING_MODEL_NAME, CLASSIFIER_MODEL_NAME)
    ),
])
(OUTPUT_DIRECTORY / "results").mkdir(parents=True, exist_ok=True)

In [None]:
# Load the embedding model named by EMBEDDING_MODEL_NAME.
# hybrid_classify calls embedding_model.embed_documents() on one text at a time,
# so the progress display is turned off (show_progress=False).
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    show_progress=False,
)

In [None]:
# Load the tokenizer and the sequence-classification model named by CLASSIFIER_MODEL_NAME.
# The model is loaded with bfloat16 weights; `dtype` stays a notebook-level variable.
dtype = torch.bfloat16
tokenizer = AutoTokenizer.from_pretrained(CLASSIFIER_MODEL_NAME)
llm_model = AutoModelForSequenceClassification.from_pretrained(CLASSIFIER_MODEL_NAME, dtype=dtype)

In [None]:
# Wrap the loaded model and tokenizer in a text-classification pipeline.
# Inputs are truncated to 512 tokens; the pipeline runs on CUDA when available, otherwise on CPU.
llm_classifier = pipeline(
    task="text-classification",
    model=llm_model,
    tokenizer=tokenizer,
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
    max_length=512,
    truncation=True,
)

In [None]:
# Load the prepared test split for DATASET_NAME ("/" in the name is replaced by "-" on disk).
# Later cells read its "text" column (classifier input) and "label" column (ground truth).
test_df = pl.read_parquet(__dir__ / f"../data/notebooks/prepare-datasets/{DATASET_NAME.replace("/", "-")}/test.parquet")

In [None]:
# Load the embedding-based classifier saved as random-forest.joblib for this dataset / embedding model.
# hybrid_classify calls base_classifier.predict_proba() on a single embedding.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source
# (presumably produced by the classifier-embedding notebook; confirm).
base_classifier = joblib.load(__dir__ / f"../data/notebooks/classifier-embedding/{DATASET_NAME.replace("/", "-")}/{EMBEDDING_MODEL_NAME.replace("/", "-")}/random-forest.joblib")

In [None]:
def hybrid_classify(min_benign_confidence: float, min_malicious_confidence: float):
  """
  Build a hybrid classifier for one pair of confidence thresholds

  The returned callable takes a text and returns True for malicious and
  False for benign. It first scores the text with the embedding-based
  classifier and only falls back to the LLM classifier when neither
  threshold is reached:

  - class-0 probability >= min_benign_confidence     -> benign (checked first)
  - class-1 probability >= min_malicious_confidence  -> malicious
  - otherwise the LLM decides; any label other than the model's label
    for id 0 counts as malicious
  """

  def hybrid_classify_worker(text: str) -> bool:
    # Score the text with the embedding-based classifier
    vector = embedding_model.embed_documents(texts=[text])[0]
    class_probabilities = base_classifier.predict_proba([vector])[0]

    # A confident benign verdict takes precedence over a confident malicious one
    if class_probabilities[0] >= min_benign_confidence:
      return False

    if class_probabilities[1] >= min_malicious_confidence:
      return True

    # Neither threshold was met: let the LLM adjudicate
    llm_label = llm_classifier(text)[0]["label"]
    benign_label = llm_model.config.id2label[0]
    return llm_label != benign_label

  return hybrid_classify_worker

In [None]:
# Ground-truth labels of the test split, shared by every evaluation run
y_actual = test_df["label"].to_list()

def evaluate(ctx):
  """
  Evaluate the hybrid classifier for the given parameters

  `ctx` is a dictionary with:
  - "index": position of the parameter combination in the grid; also the
    name of the result file (results/<index>.json)
  - "params": dictionary holding "min_benign_confidence" and
    "min_malicious_confidence"

  The whole test set is classified, the classification time, ROC AUC and
  classification report are collected, and everything is written as JSON.

  If the result file already exists, the combination is skipped. If anything
  fails (or the run is interrupted) after the file was created, the partial
  file is removed again so the combination is retried on the next run instead
  of being skipped with an empty/corrupt result; the error is re-raised.
  """

  # Unpack the context
  index = ctx["index"]
  params = ctx["params"]
  min_benign_confidence = params["min_benign_confidence"]
  min_malicious_confidence = params["min_malicious_confidence"]

  result_path = OUTPUT_DIRECTORY / f"results/{index}.json"

  # Create the result file exclusively up front: an existing file means this
  # combination was already evaluated (or is being evaluated elsewhere)
  try:
    result_file = open(result_path, "x")
  except FileExistsError as err:
    print(f"Result for index {index} already exists ({err}). Skipping...")
    return

  try:
    with result_file:
      # Classify the test dataset
      start = time()
      y_predictions = test_df["text"].map_elements(
        hybrid_classify(min_benign_confidence, min_malicious_confidence),
        # The worker returns a bool; stating it avoids dtype inference (and its warning)
        return_dtype=pl.Boolean,
      )
      end = time()

      elapsed_test_classify = end - start

      # Get the metrics
      auc = roc_auc_score(y_actual, y_predictions)
      report = classification_report(y_actual, y_predictions, target_names=["benign", "malicious"], output_dict=True)

      # Save the results
      result = {
        "index": index,
        "params": params,
        "time": elapsed_test_classify,
        "auc": auc,
        "report": report,
      }

      json_dump(result, result_file, indent=4)
  except BaseException:
    # Also covers KeyboardInterrupt: never leave an empty or half-written result
    # behind, otherwise the next run would skip this index and the loader would
    # choke on the file. The file is closed by the `with` before it is removed.
    result_path.unlink(missing_ok=True)
    raise

In [None]:
# Parameter grid: both thresholds sweep 0.0, 0.1, ..., 1.0
param_grid = {
    "min_benign_confidence": [tenth / 10 for tenth in range(11)],
    "min_malicious_confidence": [tenth / 10 for tenth in range(11)],
}

# One context per point of the cartesian product; "index" is the position in
# product order (last parameter varies fastest) and names the result file
contexts = [
    {
        "index": position,
        "params": {name: value for name, value in zip(param_grid, combination)},
    }
    for position, combination in enumerate(product(*param_grid.values()))
]

In [None]:
# Grid search: evaluate every parameter combination in order, with a progress bar.
# evaluate() writes one JSON file per index and skips indices whose file already exists,
# so re-running this cell resumes where a previous run stopped.
for ctx in tqdm(contexts):
  evaluate(ctx)

In [None]:
# Load the results
# Each results/<index>.json is flattened into one row:
#   index, auc, time, params_<name>, report_accuracy and
#   report_<class or average>_<precision|recall|f1-score|support>
results = []
# glob() returns paths in arbitrary order; sort so loading (and any message below) is deterministic
for filename in sorted(glob(str((OUTPUT_DIRECTORY / "results/*.json").resolve()))):
    with open(filename, "r") as result_file:
        try:
            raw_result = json_load(result_file)
        except ValueError as err:
            # json.JSONDecodeError is a ValueError. An interrupted evaluation can leave an
            # empty or truncated file behind; report it instead of aborting the whole load.
            print(f"Skipping unreadable result file {filename} ({err})")
            continue

    result = {
        "index": raw_result["index"],
        "auc": raw_result["auc"],
        "time": raw_result["time"],
        **({
            f"params_{k}": v
            for k, v in raw_result["params"].items()
        } | {
            f"report_{k}": v
            for k, v in raw_result["report"].items() if k in ["accuracy"]
        } | {
            f"report_{k1}_{k2}": v2
            for k1, v1 in raw_result["report"].items() if k1 in ["benign", "malicious", "macro avg", "weighted avg"]
            for k2, v2 in v1.items() if k2 in ["precision", "recall", "f1-score", "support"]
        })
    }
    results.append(result)

# Order the rows by grid index (file names sort lexicographically, e.g. "10" before "2")
results.sort(key=lambda entry: entry["index"])
results_df = pl.DataFrame(results)

In [None]:
# Grid layout: one subplot per parameter combination.
# ROWS x COLS must match the size of the parameter grid (11 x 11 thresholds).
ROWS = 11
COLS = 11
# Result columns shown under each subplot, mapped to their display names
VISIBLE_PARAMETERS = {
    "min_benign_confidence": "Minimum Benign Confidence",
    "min_malicious_confidence": "Minimum Malicious Confidence",
}
# Result columns drawn as bars, mapped to their display names
VISIBLE_REPORT_METRICS = {
    "auc": "AUC",
    "report_accuracy": "Accuracy",
    "report_weighted avg_precision": "Precision",
    "report_weighted avg_recall": "Recall",
    "report_weighted avg_f1-score": "F1 Score",
}
# One color per metric bar; the last color is reserved for the time bar
COLORS = ["blue", "green", "orange", "red", "purple", "gray"]

fig, axs = plt.subplots(ROWS, COLS, figsize=(20, 20))

# Index the rows once so every subplot is a dictionary lookup instead of a DataFrame filter.
# Iterating in reverse keeps the first row if an index ever appears twice.
results_by_index = {
    entry["index"]: entry
    for entry in reversed(results_df.to_dicts())
}

# Plot the results
max_time = max(results_df["time"])
for row, col in product(range(ROWS), range(COLS)):
    index = row * COLS + col
    ax = axs[row, col]
    ax.set_title(f"Result {index}")

    # Get the result
    result = results_by_index.get(index)
    if result is None:
        # The grid search may be incomplete (interrupted or failed runs):
        # draw a placeholder panel instead of failing the whole figure
        ax.set_xticks([])
        ax.set_yticks([])
        ax.text(0.5, 0.5, "No result", ha="center", va="center", transform=ax.transAxes)
        continue

    # Plot the result
    ax.bar(
        list(VISIBLE_REPORT_METRICS.values()),
        [result[k] for k in VISIBLE_REPORT_METRICS.keys()],
        color=COLORS[:len(VISIBLE_REPORT_METRICS)],
    )
    ax.set_xticks([])
    param_values = ", ".join(
        str(result[f"params_{k}"])
        for k in VISIBLE_PARAMETERS.keys()
    )
    ax.set_xlabel(f"({param_values})")
    # All plotted metrics lie in [0, 1]
    ax.set_ylim(0, 1)

    # Secondary axis for time, scaled identically in every subplot so panels are comparable
    ax2 = ax.twinx()
    ax2.bar(
        ["Time (s)"],
        [result["time"]],
        color=COLORS[-1],
        alpha=0.5,
    )
    ax2.set_ylim(0, max_time * 1.1)

# Title and labels
fig.suptitle("Hybrid Classifier Parameter Search Results", fontsize=16, y=1)
fig.supxlabel(f"Hyperparameters ({', '.join(VISIBLE_PARAMETERS.values())})", fontsize=12)

# Legend: one swatch per metric color plus the time color
handles = [
    plt.Rectangle((0, 0), 1, 1, color=color)
    for color in COLORS
]
labels = [
    *VISIBLE_REPORT_METRICS.values(),
    "Time (s)"
]
fig.legend(handles, labels, loc="upper right")

plt.tight_layout()
plt.savefig(OUTPUT_DIRECTORY / "results.png", dpi=600)