In [27]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import cupy as cp
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Locations of the preprocessed KDD train/test feature and label CSVs.
# All four files sit in the same directory, so it is spelled out once.
PROCESSED_BIN_DIR = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin'

processed_train_path = f'{PROCESSED_BIN_DIR}/KDDTrain_processed.csv'  # training features
processed_test_path = f'{PROCESSED_BIN_DIR}/KDDTest_processed.csv'    # test features
train_labels_path = f'{PROCESSED_BIN_DIR}/KDDTrain_labels.csv'        # training labels
test_labels_path = f'{PROCESSED_BIN_DIR}/KDDTest_labels.csv'          # test labels

In [28]:
# Cell 2: Load and Prepare Data with Enhanced Preprocessing
# Pipeline: read CSVs -> stratified 80/20 split -> RobustScaler -> top-k
# mutual-information feature selection. The scaler and the selector are fitted
# on the training split only and then applied to the validation split.
RANDOM_STATE = 42  # one seed for both the split and the feature ranking


def seeded_mutual_info(features, target):
    """Score function for SelectKBest: mutual information with a fixed seed.

    mutual_info_classif adds a small amount of random noise to continuous
    features, so without `random_state` the scores (and therefore the set of
    selected columns) can change from one run to the next.

    Parameters
    ----------
    features : feature matrix handed over by SelectKBest.
    target : class labels handed over by SelectKBest.

    Returns
    -------
    Array of per-feature mutual-information estimates.
    """
    return mutual_info_classif(features, target, random_state=RANDOM_STATE)


print("Loading and preprocessing training data...")
# Load features and labels
df_train = pd.read_csv(processed_train_path)
X = df_train
y_train = pd.read_csv(train_labels_path)
y_train_binary = y_train['label'].values

# Split data into training and validation sets (80-20), keeping the class
# proportions of y_train_binary in both parts
X_train, X_val, y_train_split, y_val = train_test_split(
    X,
    y_train_binary,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_train_binary
)

# Apply RobustScaler (statistics come from the training split only)
robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(X_train)
X_val_scaled = robust_scaler.transform(X_val)

# Feature selection: keep the n_features columns with the highest
# mutual information with the training labels
n_features = 40
selector = SelectKBest(score_func=seeded_mutual_info, k=n_features)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_split)
X_val_selected = selector.transform(X_val_scaled)

print("\nDataset shapes after preprocessing:")
print(f"Training set: {X_train_selected.shape}")
print(f"Validation set: {X_val_selected.shape}")
print("Loading and preprocessing complete!")

Loading and preprocessing training data...

Dataset shapes after preprocessing:
Training set: (100778, 40)
Validation set: (25195, 40)
Loading and preprocessing complete!


In [29]:
# Cell 3: Initialize and Train Model with Hyperparameter Tuning
print("\nTraining Isolation Forest model with hyperparameter tuning...")

# Search space for the grid search: 3 * 3 * 3 * 3 * 2 = 162 combinations.
params = dict(
    n_estimators=[100, 200, 300],     # how many trees to build
    max_samples=[0.5, 0.8, 'auto'],   # sampling ratio per tree
    contamination=[0.1, 0.2, 0.3],    # assumed anomaly ratio
    max_features=[0.6, 0.8, 1.0],     # feature sampling ratio per tree
    bootstrap=[True, False],          # bootstrap sampling on / off
)

def custom_f1_scorer(estimator, X, y):
    """Weighted F1 of an outlier detector's predictions against 0/1 labels.

    Has the ``scorer(estimator, X, y)`` signature accepted by GridSearchCV's
    ``scoring`` argument.

    Parameters
    ----------
    estimator : fitted detector whose ``predict`` returns +1 for inliers and
        -1 for outliers (the IsolationForest convention).
    X : feature matrix to score on.
    y : ground-truth labels. Predictions are mapped to 0 (prediction > 0) or
        1 (otherwise) before being compared with them.
        NOTE(review): assumes y uses 1 for anomalies - confirm against the
        label files.

    Returns
    -------
    float
        ``f1_score(y, labels, average='weighted')``. Note this is the
        class-weighted F1, not the F1 of the anomaly class alone.

    Raises
    ------
    Exception
        Whatever ``predict`` or ``f1_score`` raises is propagated. Recent
        scikit-learn versions apply GridSearchCV's ``error_score`` to scorer
        failures (NaN plus a warning by default), which keeps a broken
        candidate distinguishable from one that genuinely scores 0.0.
    """
    # No try/except on purpose: returning 0.0 on error would look like a valid
    # (bad) score, and a print() inside a parallel worker is easy to miss.
    predictions = np.asarray(estimator.predict(X))

    # +1 (inlier) -> 0, everything else (-1, outlier) -> 1
    labels = np.where(predictions > 0, 0, 1)

    return f1_score(y, labels, average='weighted')

# Set up the exhaustive search over `params`, scored by custom_f1_scorer.
# The forest itself runs single-threaded (n_jobs=1); parallelism comes from
# the search, which evaluates candidates in 4 jobs.
base_forest = IsolationForest(n_jobs=1, random_state=42)
grid_search = GridSearchCV(
    base_forest,
    params,
    scoring=custom_f1_scorer,
    cv=3,        # 3-fold cross-validation
    n_jobs=4,
    verbose=1,
)

# Run the search on the selected training features
print("Starting grid search...")
grid_search.fit(X_train_selected, y_train_split)

# Report the winning configuration
best_iso_forest = grid_search.best_estimator_
print("\nBest parameters found:")
print(grid_search.best_params_)
print(f"Best F1-score: {grid_search.best_score_:.3f}")

# Fit the winning estimator on the whole training split.
# NOTE(review): with GridSearchCV's default refit=True the best estimator has
# presumably already been refit on this data; this call trains the same object
# again, so final_model and best_iso_forest refer to one estimator.
print("\nRetraining with best parameters on full training set...")
final_model = best_iso_forest.fit(X_train_selected)


Training Isolation Forest model with hyperparameter tuning...
Starting grid search...
Fitting 3 folds for each of 162 candidates, totalling 486 fits

Best parameters found:
{'bootstrap': True, 'contamination': 0.3, 'max_features': 0.6, 'max_samples': 'auto', 'n_estimators': 300}
Best F1-score: 0.641

Retraining with best parameters on full training set...


In [30]:
# Cell 4: Evaluate on Training and Validation Sets
def _evaluate_split(model, features, true_labels, split_name):
    """Predict on one split, print its metrics and return everything computed.

    Parameters
    ----------
    model : fitted detector exposing predict() and score_samples().
    features : feature matrix of the split.
    true_labels : ground-truth labels compared against the 0/1 labels derived
        from the predictions.
    split_name : text used in the printed headings, e.g. "Training Set".

    Returns
    -------
    dict with keys 'predictions', 'scores', 'labels', 'accuracy', 'precision',
    'recall', 'f1' and 'conf_matrix', plus 'silhouette' and 'davies_bouldin'
    when more than one distinct label was predicted.
    """
    outcome = {
        'predictions': model.predict(features),
        'scores': model.score_samples(features),
    }
    # Maps a prediction of -1 to label 1 and a prediction of +1 to label 0.
    predicted = (-outcome['predictions'] / 2 + 0.5).astype(int)
    outcome['labels'] = predicted

    # The clustering metrics are only computed when both labels occur.
    if len(np.unique(predicted)) > 1:
        outcome['silhouette'] = silhouette_score(features, predicted)
        outcome['davies_bouldin'] = davies_bouldin_score(features, predicted)
        print(f"\nUnsupervised Metrics ({split_name}):")
        print(f"Silhouette Score: {outcome['silhouette']:.3f}")
        print(f"Davies-Bouldin Index: {outcome['davies_bouldin']:.3f}")

    outcome['accuracy'] = accuracy_score(true_labels, predicted)
    precision, recall, f1, _support = precision_recall_fscore_support(
        true_labels, predicted, average='binary'
    )
    outcome['precision'] = precision
    outcome['recall'] = recall
    outcome['f1'] = f1
    outcome['conf_matrix'] = confusion_matrix(true_labels, predicted)

    print(f"\nSupervised Metrics ({split_name}):")
    print(f"Accuracy: {outcome['accuracy']:.3f}")
    print(f"Precision: {outcome['precision']:.3f}")
    print(f"Recall: {outcome['recall']:.3f}")
    print(f"F1-score: {outcome['f1']:.3f}")
    print(f"\nConfusion Matrix ({split_name}):")
    print(outcome['conf_matrix'])
    return outcome


# Training Set Evaluation
print("\nEvaluating on Training Set...")
train_eval = _evaluate_split(best_iso_forest, X_train_selected, y_train_split, "Training Set")
train_predictions = train_eval['predictions']
train_scores = train_eval['scores']
train_labels = train_eval['labels']
if 'silhouette' in train_eval:
    train_silhouette = train_eval['silhouette']
    train_davies_bouldin = train_eval['davies_bouldin']
train_accuracy = train_eval['accuracy']
train_precision = train_eval['precision']
train_recall = train_eval['recall']
train_f1 = train_eval['f1']
train_conf_matrix = train_eval['conf_matrix']

# Validation Set Evaluation
print("\nEvaluating on Validation Set...")
val_eval = _evaluate_split(best_iso_forest, X_val_selected, y_val, "Validation Set")
val_predictions = val_eval['predictions']
val_scores = val_eval['scores']
val_labels = val_eval['labels']
if 'silhouette' in val_eval:
    val_silhouette = val_eval['silhouette']
    val_davies_bouldin = val_eval['davies_bouldin']
val_accuracy = val_eval['accuracy']
val_precision = val_eval['precision']
val_recall = val_eval['recall']
val_f1 = val_eval['f1']
val_conf_matrix = val_eval['conf_matrix']


Evaluating on Training Set...

Unsupervised Metrics (Training Set):
Silhouette Score: 0.384
Davies-Bouldin Index: 2.398

Supervised Metrics (Training Set):
Accuracy: 0.656
Precision: 0.702
Recall: 0.453
F1-score: 0.550

Confusion Matrix (Training Set):
[[44868  9006]
 [25676 21228]]

Evaluating on Validation Set...

Unsupervised Metrics (Validation Set):
Silhouette Score: 0.381
Davies-Bouldin Index: 2.091

Supervised Metrics (Validation Set):
Accuracy: 0.662
Precision: 0.710
Recall: 0.462
F1-score: 0.559

Confusion Matrix (Validation Set):
[[11253  2216]
 [ 6311  5415]]


In [31]:
# Cell 5: Evaluate on External Test Set
print("\nEvaluating on test set...")

# Read the test features and push them through the scaler and selector that
# were fitted on the training split
df_test = pd.read_csv(processed_test_path)
X_test_scaled = robust_scaler.transform(df_test)
X_test_selected = selector.transform(X_test_scaled)

# Ground-truth labels for the test set
y_test = pd.read_csv(test_labels_path)
y_test_binary = y_test['label'].values

# Model output: raw predictions, anomaly scores, and 0/1 labels
test_predictions = best_iso_forest.predict(X_test_selected)
test_scores = best_iso_forest.score_samples(X_test_selected)
# predict() yields -1 / +1; -1 becomes label 1 and +1 becomes label 0
test_labels = np.where(test_predictions == -1, 1, 0)

# Clustering metrics are only computed when both labels were predicted
if np.unique(test_labels).size > 1:
    test_silhouette = silhouette_score(X_test_selected, test_labels)
    test_davies_bouldin = davies_bouldin_score(X_test_selected, test_labels)
    print("\nUnsupervised Metrics (Test Set):")
    print(f"Silhouette Score: {test_silhouette:.3f}")
    print(f"Davies-Bouldin Index: {test_davies_bouldin:.3f}")

# Label-based metrics
test_accuracy = accuracy_score(y_test_binary, test_labels)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test_binary, test_labels, average='binary'
)
test_conf_matrix = confusion_matrix(y_test_binary, test_labels)

print("\nSupervised Metrics (Test Set):")
print(f"Accuracy: {test_accuracy:.3f}")
print(f"Precision: {test_precision:.3f}")
print(f"Recall: {test_recall:.3f}")
print(f"F1-score: {test_f1:.3f}")
print("\nConfusion Matrix (Test Set):")
print(test_conf_matrix)

# Predicted-label counts for every split, printed in a fixed order
print("\nSample Distribution Summary:")
for split_header, split_labels in (
    ("\nTraining Set:", train_labels),
    ("\nValidation Set:", val_labels),
    ("\nTest Set:", test_labels),
):
    normal_count = np.count_nonzero(split_labels == 0)
    anomaly_count = np.count_nonzero(split_labels == 1)
    print(split_header)
    print(f"Normal: {normal_count}, Anomaly: {anomaly_count}")


Evaluating on test set...

Unsupervised Metrics (Test Set):
Silhouette Score: -0.649
Davies-Bouldin Index: 1.992

Supervised Metrics (Test Set):
Accuracy: 0.521
Precision: 0.549
Recall: 0.892
F1-score: 0.680

Confusion Matrix (Test Set):
[[  307  9404]
 [ 1387 11446]]

Sample Distribution Summary:

Training Set:
Normal: 70544, Anomaly: 30234

Validation Set:
Normal: 17564, Anomaly: 7631

Test Set:
Normal: 1694, Anomaly: 20850
