In [11]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import cupy as cp
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Define paths
processed_train_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTest_processed.csv'
train_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTrain_labels.csv'
test_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTest_labels.csv'

In [12]:
# Cell 2: Load and Prepare Data with Enhanced Preprocessing
print("Loading and preprocessing training data...")
# Load features and labels
df_train = pd.read_csv(processed_train_path)
y_train = pd.read_csv(train_labels_path)
y_train_binary = (y_train['label'] != 'normal').astype(int)

# Split data into training and validation sets (80-20)
X_train, X_val, y_train_split, y_val = train_test_split(
    df_train, 
    y_train_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_train_binary
)

# Apply RobustScaler
robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(X_train)
X_val_scaled = robust_scaler.transform(X_val)

# Feature selection
n_features = 30
selector = SelectKBest(score_func=mutual_info_classif, k=n_features)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_split)
X_val_selected = selector.transform(X_val_scaled)

print("\nDataset shapes after preprocessing:")
print(f"Training set: {X_train_selected.shape}")
print(f"Validation set: {X_val_selected.shape}")
print("Loading and preprocessing complete!")

Loading and preprocessing training data...

Dataset shapes after preprocessing:
Training set: (100778, 30)
Validation set: (25195, 30)
Loading and preprocessing complete!


In [13]:
# Cell 3: Initialize and Train Model
print("\nTraining Isolation Forest model...")
# Calculate contamination rate from training data
train_contamination = np.mean(y_train_split)
print(f"Contamination rate in training data: {train_contamination:.3f}")

iso_forest = IsolationForest(
    n_estimators=200,
    max_samples=256,
    contamination=train_contamination,
    max_features=0.8,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

# Fit the model on training data
iso_forest.fit(X_train_selected)


Training Isolation Forest model...
Contamination rate in training data: 0.465


In [14]:
# Cell 4: Evaluate on Training and Validation Sets
# Training Set Evaluation
print("\nEvaluating on Training Set...")
train_predictions = iso_forest.predict(X_train_selected)
train_scores = iso_forest.score_samples(X_train_selected)
train_labels = (train_predictions == -1).astype(int)

if len(np.unique(train_labels)) > 1:
    train_silhouette = silhouette_score(X_train_selected, train_labels)
    train_davies_bouldin = davies_bouldin_score(X_train_selected, train_labels)
    print("\nUnsupervised Metrics (Training Set):")
    print(f"Silhouette Score: {train_silhouette:.3f}")
    print(f"Davies-Bouldin Index: {train_davies_bouldin:.3f}")

train_accuracy = accuracy_score(y_train_split, train_labels)
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(y_train_split, train_labels, average='binary')
train_conf_matrix = confusion_matrix(y_train_split, train_labels)

print("\nSupervised Metrics (Training Set):")
print(f"Accuracy: {train_accuracy:.3f}")
print(f"Precision: {train_precision:.3f}")
print(f"Recall: {train_recall:.3f}")
print(f"F1-score: {train_f1:.3f}")
print("\nConfusion Matrix (Training Set):")
print(train_conf_matrix)

# Validation Set Evaluation
print("\nEvaluating on Validation Set...")
val_predictions = iso_forest.predict(X_val_selected)
val_scores = iso_forest.score_samples(X_val_selected)
val_labels = (val_predictions == -1).astype(int)

if len(np.unique(val_labels)) > 1:
    val_silhouette = silhouette_score(X_val_selected, val_labels)
    val_davies_bouldin = davies_bouldin_score(X_val_selected, val_labels)
    print("\nUnsupervised Metrics (Validation Set):")
    print(f"Silhouette Score: {val_silhouette:.3f}")
    print(f"Davies-Bouldin Index: {val_davies_bouldin:.3f}")

val_accuracy = accuracy_score(y_val, val_labels)
val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_val, val_labels, average='binary')
val_conf_matrix = confusion_matrix(y_val, val_labels)

print("\nSupervised Metrics (Validation Set):")
print(f"Accuracy: {val_accuracy:.3f}")
print(f"Precision: {val_precision:.3f}")
print(f"Recall: {val_recall:.3f}")
print(f"F1-score: {val_f1:.3f}")
print("\nConfusion Matrix (Validation Set):")
print(val_conf_matrix)


Evaluating on Training Set...

Unsupervised Metrics (Training Set):
Silhouette Score: 0.081
Davies-Bouldin Index: 2.233

Supervised Metrics (Training Set):
Accuracy: 0.671
Precision: 0.646
Recall: 0.646
F1-score: 0.646

Confusion Matrix (Training Set):
[[37291 16583]
 [16583 30321]]

Evaluating on Validation Set...

Unsupervised Metrics (Validation Set):
Silhouette Score: 0.069
Davies-Bouldin Index: 1.999

Supervised Metrics (Validation Set):
Accuracy: 0.676
Precision: 0.650
Recall: 0.657
F1-score: 0.654

Confusion Matrix (Validation Set):
[[9317 4152]
 [4017 7709]]


In [15]:
# Cell 5: Evaluate on External Test Set
print("\nEvaluating on test set...")
# Load and preprocess test data
df_test = pd.read_csv(processed_test_path)
X_test_scaled = robust_scaler.transform(df_test)
X_test_selected = selector.transform(X_test_scaled)

# Load test labels
y_test = pd.read_csv(test_labels_path)
y_test_binary = (y_test['label'] != 'normal').astype(int)

# Get predictions
test_predictions = iso_forest.predict(X_test_selected)
test_scores = iso_forest.score_samples(X_test_selected)
test_labels = (test_predictions == -1).astype(int)

# Calculate metrics
if len(np.unique(test_labels)) > 1:
    test_silhouette = silhouette_score(X_test_selected, test_labels)
    test_davies_bouldin = davies_bouldin_score(X_test_selected, test_labels)
    print("\nUnsupervised Metrics (Test Set):")
    print(f"Silhouette Score: {test_silhouette:.3f}")
    print(f"Davies-Bouldin Index: {test_davies_bouldin:.3f}")

test_accuracy = accuracy_score(y_test_binary, test_labels)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_test_binary, test_labels, average='binary')
test_conf_matrix = confusion_matrix(y_test_binary, test_labels)

print("\nSupervised Metrics (Test Set):")
print(f"Accuracy: {test_accuracy:.3f}")
print(f"Precision: {test_precision:.3f}")
print(f"Recall: {test_recall:.3f}")
print(f"F1-score: {test_f1:.3f}")
print("\nConfusion Matrix (Test Set):")
print(test_conf_matrix)

# Print final distribution summary
print("\nSample Distribution Summary:")
print("\nTraining Set:")
print(f"Normal: {sum(train_labels == 0)}, Anomaly: {sum(train_labels == 1)}")
print("\nValidation Set:")
print(f"Normal: {sum(val_labels == 0)}, Anomaly: {sum(val_labels == 1)}")
print("\nTest Set:")
print(f"Normal: {sum(test_labels == 0)}, Anomaly: {sum(test_labels == 1)}")


Evaluating on test set...

Supervised Metrics (Test Set):
Accuracy: 0.569
Precision: 0.569
Recall: 1.000
F1-score: 0.725

Confusion Matrix (Test Set):
[[    0  9711]
 [    0 12833]]

Sample Distribution Summary:

Training Set:
Normal: 53874, Anomaly: 46904

Validation Set:
Normal: 13334, Anomaly: 11861

Test Set:
Normal: 0, Anomaly: 22544
