In [3]:
# Cell 1: Import Libraries and Setup Paths
import os

import cudf
import cupy as cp
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Define paths
# All four CSVs live in one directory. It defaults to the original location,
# but can be redirected by setting the USL_NSL_DATA_DIR environment variable
# before the kernel starts, so the notebook can run on another machine without
# editing this cell.
DATA_DIR = os.environ.get(
    'USL_NSL_DATA_DIR',
    '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin',
).rstrip('/')

# Plain '/'-joined strings (not Path objects) so the values stay identical to
# the previously hard-coded ones when the default directory is used.
processed_train_path = f'{DATA_DIR}/KDDTrain_processed.csv'
processed_test_path = f'{DATA_DIR}/KDDTest_processed.csv'
train_labels_path = f'{DATA_DIR}/KDDTrain_labels.csv'
test_labels_path = f'{DATA_DIR}/KDDTest_labels.csv'

In [4]:
# Cell 2: Load and Prepare Data
print("Loading training data...")
# Feature matrix: every column of the processed training CSV as a NumPy array
df_train = pd.read_csv(processed_train_path)
X = df_train.values

# Ground truth: 'normal' -> 0, any other value of the 'label' column -> 1
y_train = pd.read_csv(train_labels_path)
y_train_binary = y_train['label'].ne('normal').astype(int)

# Hold out 20% for validation; stratified on the binary label, fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y_train_binary)
X_train, X_val, y_train_split, y_val = train_test_split(X, y_train_binary, **split_options)

# Report the resulting shapes
print(
    "Dataset shapes:",
    f"Training set: {X_train.shape}",
    f"Validation set: {X_val.shape}",
    "Loading complete!",
    sep="\n",
)

Loading training data...


Dataset shapes:
Training set: (100778, 43)
Validation set: (25195, 43)
Loading complete!


In [5]:
# Cell 3: Initialize and Train Model
print("\nTraining Isolation Forest model...")

# Hyperparameters are collected in one place so they are easy to scan and tune.
# NOTE(review): the classification report recorded for the training split shows
# 46904 attack rows out of 100778 (about 0.47), which is higher than the 0.35
# contamination used here -- confirm 0.35 is the intended value.
iso_forest_params = {
    'n_estimators': 100,     # number of trees (scikit-learn's default value)
    'max_samples': 'auto',   # let scikit-learn choose the per-tree subsample size
    'contamination': 0.35,   # assumed proportion of anomalies; sets the decision threshold
    'random_state': 42,      # fixed seed for reproducible trees
    'n_jobs': -1,            # use all CPU cores
}
iso_forest = IsolationForest(**iso_forest_params)

# Unsupervised fit: only the training-split features are passed, no labels
iso_forest.fit(X_train)


Training Isolation Forest model...


In [6]:
# Cell 4: Evaluate on Training and Validation Sets
def _evaluate_split(model, features, true_labels, split_name):
    """Score one labelled split with ``model`` and print its metric report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score_samples``.
    features : feature rows to score.
    true_labels : binary ground truth aligned with ``features``
        (0 = normal, 1 = attack, as built in the data-loading cell).
    split_name : str
        Name used in the printed headings, e.g. ``"Training"``.

    Returns
    -------
    tuple
        ``(predictions, scores, labels, silhouette, davies_bouldin, accuracy,
        precision, recall, f1, conf_matrix, class_report)`` in that order.
        ``silhouette`` and ``davies_bouldin`` are NaN when the model assigns
        every row to the same class.
    """
    print(f"\nEvaluating on {split_name} Set...")
    predictions = model.predict(features)
    scores = model.score_samples(features)
    # IsolationForest.predict returns -1 for outliers; map that to 1 (= attack)
    labels = (predictions == -1).astype(int)

    # Both clustering metrics raise ValueError unless at least two distinct
    # labels are present, so report NaN instead of aborting the cell.
    # Note: silhouette_score works on pairwise distances over all rows; its
    # sample_size argument can be used if this becomes too slow.
    if np.unique(labels).size < 2:
        silhouette = davies_bouldin = float("nan")
    else:
        silhouette = silhouette_score(features, labels)
        davies_bouldin = davies_bouldin_score(features, labels)

    print(f"\nUnsupervised Evaluation Metrics ({split_name} Set):")
    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.3f}")

    accuracy = accuracy_score(true_labels, labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, labels, average='binary'
    )
    conf_matrix = confusion_matrix(true_labels, labels)
    class_report = classification_report(true_labels, labels)

    print(f"\nSupervised Evaluation Metrics ({split_name} Set):")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-score: {f1:.3f}")
    print(f"\nConfusion Matrix ({split_name} Set):")
    print(conf_matrix)
    print(f"\nClassification Report ({split_name} Set):")
    print(class_report)

    return (
        predictions, scores, labels,
        silhouette, davies_bouldin,
        accuracy, precision, recall, f1,
        conf_matrix, class_report,
    )


# Training Set Evaluation
# Every result is bound to a top-level name so that other cells can use it.
(
    train_predictions, train_scores, train_labels,
    train_silhouette, train_davies_bouldin,
    train_accuracy, train_precision, train_recall, train_f1,
    train_conf_matrix, train_class_report,
) = _evaluate_split(iso_forest, X_train, y_train_split, "Training")

# Validation Set Evaluation
(
    val_predictions, val_scores, val_labels,
    val_silhouette, val_davies_bouldin,
    val_accuracy, val_precision, val_recall, val_f1,
    val_conf_matrix, val_class_report,
) = _evaluate_split(iso_forest, X_val, y_val, "Validation")


Evaluating on Training Set...

Unsupervised Evaluation Metrics (Training Set):
Silhouette Score: 0.218
Davies-Bouldin Index: 3.279

Supervised Evaluation Metrics (Training Set):
Accuracy: 0.635
Precision: 0.644
Recall: 0.484
F1-score: 0.553

Confusion Matrix (Training Set):
[[41305 12569]
 [24201 22703]]

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.63      0.77      0.69     53874
           1       0.64      0.48      0.55     46904

    accuracy                           0.64    100778
   macro avg       0.64      0.63      0.62    100778
weighted avg       0.64      0.64      0.63    100778


Evaluating on Validation Set...

Unsupervised Evaluation Metrics (Validation Set):
Silhouette Score: 0.215
Davies-Bouldin Index: 3.350

Supervised Evaluation Metrics (Validation Set):
Accuracy: 0.639
Precision: 0.647
Recall: 0.495
F1-score: 0.561

Confusion Matrix (Validation Set):
[[10297  3172]
 [ 5918  5808]]

Classificat

In [7]:
# Cell 5: Evaluate on Test Set
print("\nEvaluating on test set...")
# Load test data
df_test = pd.read_csv(processed_test_path)

# The forest was fitted on a bare NumPy array taken from df_train.values, so it
# matches features purely by position and cannot notice renamed, missing or
# reordered columns. Check the test frame against the training frame first.
# NOTE(review): the recorded run of this cell flagged almost every test row as
# an anomaly (0 true negatives); inconsistent preprocessing or column layout
# between the two CSVs is one possible cause -- confirm.
expected_columns = list(df_train.columns)
if list(df_test.columns) != expected_columns:
    missing_columns = [c for c in expected_columns if c not in df_test.columns]
    unexpected_columns = [c for c in df_test.columns if c not in expected_columns]
    if missing_columns or unexpected_columns:
        raise ValueError(
            "Test features do not match the training features: "
            f"missing={missing_columns}, unexpected={unexpected_columns}"
        )
    # Same columns in a different order: restore the training order.
    df_test = df_test[expected_columns]
X_test = df_test.values

# Load test labels and binarise them the same way as the training labels
# ('normal' -> 0, anything else -> 1)
y_test = pd.read_csv(test_labels_path)
y_test_binary = (y_test['label'] != 'normal').astype(int)

# Get predictions for test data
raw_predictions_test = iso_forest.predict(X_test)
scores_test = iso_forest.score_samples(X_test)
# IsolationForest.predict returns -1 for outliers; map that to 1 (= attack)
labels_test = (raw_predictions_test == -1).astype(int)

# Calculate unsupervised metrics for test set.
# Both metrics raise ValueError unless at least two distinct labels are
# present, so report NaN instead of aborting when every row gets one class.
if np.unique(labels_test).size < 2:
    silhouette_avg_test = davies_bouldin_idx_test = float("nan")
else:
    silhouette_avg_test = silhouette_score(X_test, labels_test)
    davies_bouldin_idx_test = davies_bouldin_score(X_test, labels_test)

print("\nUnsupervised Evaluation Metrics (Test Set):")
print(f"Silhouette Score: {silhouette_avg_test:.3f}")
print(f"Davies-Bouldin Index: {davies_bouldin_idx_test:.3f}")

# Calculate supervised metrics for test set
accuracy_test = accuracy_score(y_test_binary, labels_test)
precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(
    y_test_binary, labels_test, average='binary'
)
conf_matrix_test = confusion_matrix(y_test_binary, labels_test)
class_report_test = classification_report(y_test_binary, labels_test)

print("\nSupervised Evaluation Metrics (Test Set):")
print(f"Accuracy: {accuracy_test:.3f}")
print(f"Precision: {precision_test:.3f}")
print(f"Recall: {recall_test:.3f}")
print(f"F1-score: {f1_test:.3f}")
print("\nConfusion Matrix:")
print(conf_matrix_test)
print("\nClassification Report:")
print(class_report_test)


Evaluating on test set...

Unsupervised Evaluation Metrics (Test Set):
Silhouette Score: 0.219
Davies-Bouldin Index: 0.713

Supervised Evaluation Metrics (Test Set):
Accuracy: 0.557
Precision: 0.564
Recall: 0.979
F1-score: 0.716

Confusion Matrix:
[[    0  9711]
 [  268 12565]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      9711
           1       0.56      0.98      0.72     12833

    accuracy                           0.56     22544
   macro avg       0.28      0.49      0.36     22544
weighted avg       0.32      0.56      0.41     22544

