In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from models import create_dynamic_neural_network
from train import train_adversarial, compute_validation_metrics
import dataloader_ids
from dataloader_ids import load_and_prepare_data

import optuna

import os
import time
import json
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
# Raise Optuna's log level to WARNING so its INFO-level messages do not
# interleave with the progress lines this notebook prints itself.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Query CUDA once and derive everything else from that single answer.
cuda_available = torch.cuda.is_available()

# cuDNN autotuning is only switched on when a GPU is actually present.
if cuda_available:
    torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if cuda_available else "cpu")

print(device)
print(cuda_available)

In [None]:
# Objective Function
def objective(trial, dataset_key, encoding_key, multiclass=False, verbose=False):
    """
    Optuna objective: build, train and score one network configuration.

    Hyperparameters (batch size, number of layers, per-layer width, dropout,
    learning rate) are requested from ``trial``; data comes from
    ``load_and_prepare_data`` with ``model_discovery=True``. Training uses the
    module-level ``NUM_EPOCHS`` and ``device``.

    Args:
        trial: Object exposing ``suggest_*``, ``set_user_attr`` and ``number``.
        dataset_key (str): Forwarded to ``load_and_prepare_data``.
        encoding_key (str): Forwarded to ``load_and_prepare_data``. The value
            'Raw' forces the batch size to 256.
        multiclass (bool): Forwarded to the data loader and model factory.
        verbose (bool): Forwarded to ``train_adversarial``.

    Returns:
        The last entry of ``results["val_losses"]``.

    Side effects:
        Stores ``val_acc``, ``val_f1``, ``val_precision``, ``val_recall`` and
        ``effective_batch_size`` as user attributes on ``trial``. The last one
        is the batch size the loaders were really built with, which can differ
        from ``trial.params["batch_size"]`` (see below).
    """

    # Always sample, even for 'Raw', so every trial shares the same search space.
    suggested_batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])

    # 'Raw' runs are pinned to 256; the value sampled above is still what the
    # trial keeps in its params, hence the effective_batch_size attribute.
    if encoding_key == 'Raw':
        suggested_batch_size = 256

    num_layers = trial.suggest_int("num_layers", 2, 4)
    hidden_dims = [
        trial.suggest_int(f"hidden_dim_layer_{i+1}", 16, 128, step=16)
        for i in range(num_layers)
    ]
    dropout_rate = trial.suggest_float("dropout", 0.0, 0.5)
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-1, log=True)

    print(f"Trial: {trial.number}, Batch Size: {suggested_batch_size}, Hidden Dims: {hidden_dims}, Dropout: {dropout_rate}, LR: {learning_rate}")

    def _load(batch_size):
        # Single place that knows how discovery data is requested.
        return load_and_prepare_data(
            dataset_key=dataset_key,
            encoding_key=encoding_key,
            multiclass=multiclass,
            batch_size=batch_size,
            model_discovery=True
        )

    ###### IDS ######
    effective_batch_size = suggested_batch_size
    train_loader, _, test_loader, _, input_dim, output_dim, _, _, _ = _load(effective_batch_size)

    print(f"Dataset size: {len(train_loader.dataset)}, Batch size: {suggested_batch_size}, Num batches: {len(train_loader)}")

    # A training set smaller than one batch gets a smaller batch size and the
    # loaders are rebuilt with it.
    train_size = len(train_loader.dataset)
    if train_size < suggested_batch_size:
        effective_batch_size = max(16, train_size // 10)  # Default to at least 10 batches if possible
        print(f"Adjusted batch size to {effective_batch_size} due to small dataset size.")
        train_loader, _, test_loader, _, input_dim, output_dim, _, _, _ = _load(effective_batch_size)

    # Record what was actually used so later stages are not misled by trial.params.
    trial.set_user_attr("effective_batch_size", effective_batch_size)

    # Create the model
    model, criterion, optimizer = create_dynamic_neural_network(
        input_dim=input_dim,
        output_dim=output_dim,
        multiclass=multiclass,
        hidden_dims=hidden_dims,
        optimizer="adam",
        lr=learning_rate,
        dropout_rate=dropout_rate
    )
    model = model.to(device)

    # Train the model; the loader named test_loader is the one evaluated during training.
    results, model = train_adversarial(
        model, train_loader, test_loader, optimizer, criterion, device, num_epochs=NUM_EPOCHS, verbose=verbose
    )

    # Extract metrics (last recorded entry of each history list)
    val_loss = results["val_losses"][-1]
    val_acc = results["val_accuracies"][-1]
    val_precision = results["val_precisions"][-1]
    val_recall = results["val_recalls"][-1]
    val_f1 = results["val_f1s"][-1]

    # Store additional metrics in the trial
    trial.set_user_attr("val_acc", val_acc)
    trial.set_user_attr("val_f1", val_f1)
    trial.set_user_attr("val_precision", val_precision)
    trial.set_user_attr("val_recall", val_recall)

    print(f"Trial: {trial.number}, Val Loss: {val_loss}, Val Acc: {val_acc}, Val Precision: {val_precision}, Val Recall: {val_recall}, Val F1: {val_f1}")

    return val_loss


In [None]:
def seed_trial(study, dataset_key, encoding_key, multiclass, verbose):
    """
    Evaluate one hand-picked configuration and add it to ``study`` as a
    completed trial.

    The configuration is a two-layer network (64, 32), no dropout, lr 0.001 and
    a batch size of 256 for the 'Raw'/'Flows' encodings or 128 otherwise.

    Args:
        study (optuna.Study): Study that receives the seed trial.
        dataset_key (str): Forwarded to ``objective``.
        encoding_key (str): Forwarded to ``objective``; also selects the seed
            batch size.
        multiclass (bool): Forwarded to ``objective``.
        verbose (bool): Forwarded to ``objective``.

    Returns:
        None. The study gains one COMPLETE trial flagged with the user
        attribute ``seed_trial=True``.
    """
    seed_params = {
        "num_layers": 2,
        "hidden_dim_layer_1": 64,
        "hidden_dim_layer_2": 32,
        "dropout": 0.0,
        "lr": 0.001,
        "batch_size": 256 if encoding_key in ["Raw", "Flows"] else 128
    }

    # Must mirror the ranges objective() asks for, otherwise the fixed params
    # would not belong to the same search space as the sampled trials.
    distributions = {
        "batch_size": optuna.distributions.CategoricalDistribution([64, 128, 256]),
        "num_layers": optuna.distributions.IntDistribution(2, 4),
        "hidden_dim_layer_1": optuna.distributions.IntDistribution(16, 128, step=16),
        "hidden_dim_layer_2": optuna.distributions.IntDistribution(16, 128, step=16),
        "dropout": optuna.distributions.FloatDistribution(0.0, 0.5),
        "lr": optuna.distributions.FloatDistribution(1e-5, 1e-1, log=True)
    }

    # Throwaway trial handed to objective(): it carries the fixed params and
    # collects the user attributes objective() sets. It is never stored.
    probe = optuna.trial.FrozenTrial(
        number=len(study.trials),
        trial_id=len(study.trials),
        state=optuna.trial.TrialState.COMPLETE,
        value=None,
        values=None,
        datetime_start=None,
        datetime_complete=None,
        params=seed_params,
        distributions=distributions,
        user_attrs={},
        system_attrs={},
        intermediate_values={},
    )

    # Timestamps bracket the evaluation so the stored trial's duration reflects
    # the real training time instead of being ~0.
    started_at = datetime.now()
    val_loss = objective(probe, dataset_key, encoding_key, multiclass, verbose)
    finished_at = datetime.now()

    # Add the trial to the study
    study.add_trial(
        optuna.trial.FrozenTrial(
            number=len(study.trials),
            trial_id=len(study.trials),
            state=optuna.trial.TrialState.COMPLETE,
            value=val_loss,
            values=None,
            datetime_start=started_at,
            datetime_complete=finished_at,
            params=seed_params,
            distributions=distributions,
            # Mark as seed trial and carry over everything objective() recorded.
            user_attrs={"seed_trial": True, **probe.user_attrs},
            system_attrs={},
            intermediate_values={},
        )
    )

In [None]:
def check_convergence(study, threshold=0.001, patience=10):
    """
    Check if optimization is converging.

    The study is considered converged when the best (lowest) value seen in the
    last ``patience`` trials improves on the best value seen before them by
    less than ``threshold``. Lower values are treated as better.

    Args:
        study (optuna.Study): The Optuna study object. Only ``study.trials``
            and each trial's ``value`` are read.
        threshold (float): Minimum improvement to consider as progress.
        patience (int): Number of trials to check for improvement.

    Returns:
        bool: True if optimization has converged, False otherwise. Always
        False while fewer than ``patience + 1`` trials have a value.

    Raises:
        ValueError: If ``patience`` is smaller than 1.
    """
    # Trials without a value (e.g. failed ones) cannot be compared; skip them.
    losses = [t.value for t in study.trials if t.value is not None]
    if len(losses) < patience + 1:
        return False  # Not enough trials to check convergence

    if patience < 1:
        raise ValueError("patience must be a positive integer")

    # Signed difference: a recent window that is merely worse yields a
    # non-positive improvement and therefore counts as "no progress".
    best_before = min(losses[:-patience])
    best_recent = min(losses[-patience:])
    improvement = best_before - best_recent

    return improvement < threshold


def write_all_trials(study, dataset, encoding):
    """
    Persist a study's trials as JSON under results/model_discovery.

    Two files are written to ``results/model_discovery/<dataset>/<encoding>/``
    (the directory is created if missing): ``all_trials.json`` with one record
    per trial, and ``best_trial_index.json`` with the number of the best trial.

    Args:
        study (optuna.Study): The Optuna study object.
        dataset (str): Dataset name, used as a path component.
        encoding (str): Encoding type, used as a path component.
    """
    base_dir = "results/model_discovery"
    out_dir = f"{base_dir}/{dataset}/{encoding}/"

    # One flat record per trial; duration is stringified before serialisation.
    records = [
        {
            "trial_number": t.number,
            "params": t.params,
            "value": t.value,
            "user_attrs": t.user_attrs,
            "duration": str(t.duration),
        }
        for t in study.trials
    ]

    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}all_trials.json", "w") as f:
        json.dump(records, f, indent=4)

    # Resolve the best trial before opening the second file.
    best_number = study.best_trial.number
    with open(f"{out_dir}best_trial_index.json", "w") as f:
        json.dump({"best_trial_index": best_number}, f)

def optimize_with_convergence(dataset, encoding, multiclass, max_trials=50, threshold=0.001, patience=10, verbose=False):
    """
    Run a seeded Optuna search with early stopping on stalled progress.

    A fresh minimising study is created and seeded through ``seed_trial``. Up
    to ``max_trials - 1`` further trials are then run one at a time. After each
    one the study's best value is compared with the best value remembered so
    far; the search stops once ``patience`` consecutive checks fail to beat it
    by more than ``threshold``. All trials are written out with
    ``write_all_trials`` before returning.

    Args:
        dataset (str): Dataset name; lower-cased before being passed to the objective.
        encoding (str): Encoding key passed to the objective.
        multiclass (bool): Passed to the objective.
        max_trials (int): Upper bound on trials, seed trial included.
        threshold (float): Minimum decrease of the best loss that counts as progress.
        patience (int): Number of non-improving checks tolerated before stopping.
        verbose (bool): Passed to the objective.

    Returns:
        tuple: ``(best_trial.params, best_trial.user_attrs)``.
    """
    print(f"Starting optimization for dataset: {dataset}, encoding: {encoding}")

    dataset_key = dataset.lower()

    def run_objective(trial):
        # Binds the fixed arguments so Optuna only has to supply the trial.
        return objective(trial, dataset_key=dataset_key, encoding_key=encoding, multiclass=multiclass, verbose=verbose)

    def report_trial(study, trial):
        # Progress line emitted by Optuna after every finished trial.
        print(
            f"Trial {trial.number}/{max_trials} finished in {trial.duration}. Loss: {trial.value:.4f}"
        )

    study = optuna.create_study(direction="minimize")
    seed_trial(study, dataset_key=dataset_key, encoding_key=encoding, multiclass=multiclass, verbose=verbose)

    stale_checks = 0
    best_seen = float('inf')

    # The seed trial already used one slot of the budget.
    for _ in range(max_trials - 1):
        study.optimize(run_objective, n_trials=1, callbacks=[report_trial])

        current_best = study.best_value
        improved = current_best < best_seen - threshold
        if improved:
            best_seen = current_best
            stale_checks = 0
        else:
            stale_checks += 1

        if stale_checks >= patience:
            print(f"Early stopping triggered: No improvement in the last {patience} trials.")
            break

    winner = study.best_trial

    print("\nOptimization completed.")
    print(f"Best trial number: {winner.number}")
    print(f"Best hyperparameters: {winner.params}")
    print(f"Best validation loss: {winner.value}")
    print(f"Best accuracy: {winner.user_attrs.get('val_acc')}")
    print(f"Best F1 score: {winner.user_attrs.get('val_f1')}")

    # Write all trials and best trial index to files
    write_all_trials(study, dataset, encoding)

    return winner.params, winner.user_attrs



# MAIN

In [None]:
# Datasets to process and, position by position, the multiclass flag used for
# each: the run cells iterate zip(MULTICLASS, DATASETS), so 'mirai' is paired
# with False and 'unsw-nb15' with True.
DATASETS = ['mirai', 'unsw-nb15']
MULTICLASS = [False, True]

# Encoding keys; every dataset is run once per entry.
ENCODINGS = ['Raw', 'DM', 'Stats']

NUM_EPOCHS = 100  # training epochs per trial; read as a global inside objective()
MAX_TRIALS = 20  # passed as max_trials to optimize_with_convergence
EARLY_STOP_PATIENCE = 5  # passed as patience to optimize_with_convergence
MIN_DELTA = 0.001  # passed as threshold to optimize_with_convergence

In [None]:
import importlib

# Pick up edits made to dataloader_ids since the kernel started.
importlib.reload(dataloader_ids)
from dataloader_ids import load_and_prepare_data


base_dir = "results/model_discovery"

# One search per (dataset, encoding); each dataset brings its own multiclass flag.
for multiclass, dataset in zip(MULTICLASS, DATASETS):
    for encoding in ENCODINGS:
        best_params, best_metrics = optimize_with_convergence(
            dataset=dataset,
            encoding=encoding,
            multiclass=multiclass,
            max_trials=MAX_TRIALS,
            threshold=MIN_DELTA,
            patience=EARLY_STOP_PATIENCE,
            verbose=True
        )

        # Persist the winning configuration and its metrics next to the trial dump.
        out_dir = f"{base_dir}/{dataset}/{encoding}/"
        os.makedirs(out_dir, exist_ok=True)
        for filename, payload in (
            ("best_params.json", best_params),
            ("best_metrics.json", best_metrics),
        ):
            with open(f"{out_dir}{filename}", "w") as f:
                json.dump(payload, f)


# CV ASSESSMENT OF BEST PARAMS

In [None]:
def extract_hidden_dims(params):
    """
    Collect the per-layer hidden sizes from a hyperparameter dict.

    Args:
        params (dict): Hyperparameters; layer widths are stored under keys of
            the form ``hidden_dim_layer_<n>``. Other keys are ignored.

    Returns:
        list: The values of the ``hidden_dim_layer_*`` entries ordered by their
        numeric layer index (so layer 2 precedes layer 10). Keys whose suffix
        is not a plain number are kept and placed after the numeric ones,
        sorted alphabetically. Empty if no such key exists.
    """
    prefix = "hidden_dim_layer_"

    def layer_order(key):
        suffix = key[len(prefix):]
        # Compare layer numbers as integers; plain string ordering would put
        # "..._10" ahead of "..._2".
        if suffix.isdecimal():
            return (0, int(suffix), "")
        return (1, 0, suffix)

    layer_keys = sorted((key for key in params if key.startswith(prefix)), key=layer_order)
    return [params[key] for key in layer_keys]

In [None]:
import json
import os
import numpy as np

import importlib

# Pick up edits made to dataloader_ids since the kernel started.
importlib.reload(dataloader_ids)
from dataloader_ids import load_and_prepare_data


# Re-train the best configuration of every (dataset, encoding) pair on five CV
# folds and store per-fold and averaged metrics next to the discovery results.
for multiclass, dataset in zip(MULTICLASS, DATASETS):
    for encoding in ENCODINGS:
        print(f"Running stratified CV for dataset: {dataset}")

        results_dir = f"results/model_discovery/{dataset}/{encoding}/"
        best_params_fp = f"{results_dir}best_params.json"
        best_metrics_fp = f"{results_dir}best_metrics.json"
        fold_metrics_fp = f"{results_dir}fold_metrics.json"
        avg_metrics_fp = f"{results_dir}avg_metrics.json"

        # Load best params and metrics
        with open(best_params_fp, "r") as f:
            best_params = json.load(f)

        hidden_dims = extract_hidden_dims(best_params)
        learning_rate = best_params["lr"]
        dropout_rate = best_params["dropout"]
        batch_size = best_params["batch_size"]

        # objective() trains every 'Raw' trial with batch size 256 regardless
        # of the sampled value, so the stored parameter may name a batch size
        # that was never used. Apply the same rule here to evaluate the
        # configuration that was actually selected.
        if encoding == 'Raw':
            batch_size = 256

        with open(best_metrics_fp, "r") as f:
            best_metrics = json.load(f)

        fold_metrics = []

        # Iterate over folds
        for fold_idx in [0, 1, 2, 3, 4]:
            print(f"Fold {fold_idx+1}")

            train_loader, val_loader, test_loader, _, input_dim, output_dim, y_mapping, scaler, _ = load_and_prepare_data(
                dataset_key=dataset.lower(),
                encoding_key=encoding,
                multiclass=multiclass,
                batch_size=batch_size,
                cv=True,
                cv_fold_index=fold_idx
            )
            print(f"Dataset size: {len(train_loader.dataset)}, Batch size: {batch_size}, Num batches: {len(train_loader)}")

            # A fresh model per fold so no weights leak between folds.
            model, criterion, optimizer = create_dynamic_neural_network(
                input_dim=input_dim,
                output_dim=output_dim,
                multiclass=multiclass,
                hidden_dims=hidden_dims,
                optimizer="adam",
                lr=learning_rate,
                dropout_rate=dropout_rate
            )
            model = model.to(device)

            # Train and evaluate
            # NOTE(review): discovery trains for NUM_EPOCHS while this uses a
            # fixed 10 epochs — confirm the difference is intentional.
            results, _ = train_adversarial(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                optimizer=optimizer,
                criterion=criterion,
                device=device,
                num_epochs=10,
                verbose=True,
            )

            print("\n--- Evaluating on Test Set ---")
            test_metrics = compute_validation_metrics(
                model=model,
                val_loader=test_loader,
                criterion=criterion,
                device=device,
            )
            avg_test_loss, test_accuracy, test_precision, test_recall, test_f1 = test_metrics
            print(f"Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy}, Test F1 Score: {test_f1}. Precision: {test_precision}, Recall: {test_recall}")  

            # Collect metrics. These are the last validation entries from
            # training; the test-set metrics above are printed only.
            fold_metrics.append({
                "fold": fold_idx + 1,
                "accuracy": results["val_accuracies"][-1],
                "f1": results["val_f1s"][-1],
                "precision": results["val_precisions"][-1],
                "recall": results["val_recalls"][-1],
                "train_loss": results["train_losses"],
                "val_loss": results["val_losses"],
            })

        # Save fold metrics
        os.makedirs(results_dir, exist_ok=True)
        with open(fold_metrics_fp, "w") as f:
            json.dump(fold_metrics, f, indent=4)

        # Average metrics
        avg_metrics = {
            metric: np.mean([fold[metric] for fold in fold_metrics]) for metric in ["accuracy", "f1", "precision", "recall"]
        }
        avg_metrics["val_loss"] = np.mean([fold["val_loss"][-1] for fold in fold_metrics])  # Last validation loss

        # Save average metrics
        with open(avg_metrics_fp, "w") as f:
            json.dump(avg_metrics, f, indent=4)

        # Print results
        print(f"Average metrics for {dataset}: {avg_metrics}")
        print(f"Best metrics for {dataset}: {best_metrics}")
