In [17]:
# Cell 1: Import libraries and set paths
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import pickle

# Define paths
# Every processed artifact lives in one directory; edit DATA_DIR to relocate
# the whole dataset instead of touching each path individually.
DATA_DIR = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi'
processed_train_path = f'{DATA_DIR}/KDDTrain_processed.csv'
processed_test_path = f'{DATA_DIR}/KDDTest_processed.csv'
train_labels_path = f'{DATA_DIR}/KDDTrain_labels.csv'
test_labels_path = f'{DATA_DIR}/KDDTest_labels.csv'

# Load class names mapping
preprocessing_path = f'{DATA_DIR}/preprocessing_objects.pkl'
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only point this at a file produced by this project's own preprocessing step.
with open(preprocessing_path, 'rb') as f:
    preprocessing_objects = pickle.load(f)

# Lookup from integer class id to a human-readable class name; it is indexed
# with integer labels in the reporting code of the following cells.
class_names = preprocessing_objects['class_names']

In [18]:
# Cell 2: Load and prepare data
print("Loading training data...")
df_train = pd.read_csv(processed_train_path)

# Target vector first, then the feature matrix (every column except the label)
y = df_train['multiclass_label'].values
X = df_train.drop(columns='multiclass_label').values

# Stratified 80/20 split into training and validation parts
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Per-class sample counts over the full loaded set (before the split)
class_dist = pd.Series(y).value_counts().sort_index()
print("\nClass distribution in training data:")
for class_id, count in class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y)*100:.2f}%)")

# Shapes of the two resulting splits
print(
    "\nDataset shapes:",
    f"Training set: {X_train.shape}",
    f"Validation set: {X_val.shape}",
    "Loading complete!",
    sep="\n",
)

Loading training data...

Class distribution in training data:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)

Dataset shapes:
Training set: (100778, 43)
Validation set: (25195, 43)
Loading complete!


In [19]:
# Cell 3: Training multiclass Isolation Forest models
#
# Fits one IsolationForest per class label and collects them, in label order,
# in `isolation_forests` (index i holds the model for class i).
print("\nTraining multiclass Isolation Forest models...")

# One seed drives both the normal-sample subsampling and every forest, so a
# Restart & Run All reproduces exactly the same models. Previously the
# subsampling used the unseeded global NumPy RNG and changed on every run.
RANDOM_STATE = 42
subsample_rng = np.random.default_rng(RANDOM_STATE)

# Get number of classes (labels are used as 0..n_classes-1 indices below)
n_classes = len(np.unique(y))
print(f"Number of classes: {n_classes}")

# Adjust contamination parameters based on the proportion of each class in the data.
# Class 0 uses a fixed 0.1; every other class uses 1 - proportion, capped at
# 0.5 (the maximum IsolationForest accepts).
# NOTE: contamination only sets the decision threshold used by predict() /
# decision_function(); it does not change score_samples(), which is what the
# evaluation cells in this notebook rank by.
class_proportions = pd.Series(y_train).value_counts(normalize=True).sort_index()
contaminations = {}
for i in range(n_classes):
    if i == 0:
        contaminations[i] = 0.1
    else:
        contaminations[i] = min(0.5, 1 - class_proportions[i])

print("Contamination parameters for each class:")
for i in range(n_classes):
    print(f"Class {i} ({class_names[i]}): {contaminations[i]:.4f}")

# Train an Isolation Forest model for each class
isolation_forests = []

# The normal-traffic rows are needed by every iteration; select them once.
normal_samples = X_train[y_train == 0]

for i in range(n_classes):
    print(f"\nTraining model for class {i} ({class_names[i]})...")

    if i == 0:
        # The class-0 model is fit on normal traffic only.
        current_class_samples = normal_samples
        print(f"Using {len(current_class_samples)} normal samples for training")
    else:
        abnormal_samples = X_train[y_train == i]

        # Keep a reasonable ratio between normal and abnormal samples:
        # five normals per class-i sample, at least 1000, never more than exist.
        n_normal = min(len(normal_samples), max(len(abnormal_samples) * 5, 1000))

        # Select normal samples randomly (seeded, without replacement)
        normal_indices = subsample_rng.choice(len(normal_samples), n_normal, replace=False)
        selected_normal = normal_samples[normal_indices]

        # Combine samples
        # NOTE(review): this mix is dominated by normal traffic, so the fitted
        # model scores how typical a point is of "normal + class i", not of
        # class i alone. Confirm this is the intended training set for a
        # per-class model that is later compared via lowest anomaly score.
        current_class_samples = np.vstack([selected_normal, abnormal_samples])
        print(f"Using {len(selected_normal)} normal samples and {len(abnormal_samples)} samples of class {i} for training")

    # Train the Isolation Forest model for the current class
    iso_forest = IsolationForest(
        n_estimators=200,
        max_samples='auto',
        contamination=contaminations[i],
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # Fit the model
    iso_forest.fit(current_class_samples)
    isolation_forests.append(iso_forest)

print("\nAll models have been trained!")


Training multiclass Isolation Forest models...
Number of classes: 5
Contamination parameters for each class:
Class 0 (Normal Traffic): 0.1000
Class 1 (DOS (Denial of Service)): 0.5000
Class 2 (Probe (Surveillance/Scanning)): 0.5000
Class 3 (R2L (Remote to Local)): 0.5000
Class 4 (U2R (User to Root)): 0.5000

Training model for class 0 (Normal Traffic)...
Using 53874 normal samples for training

Training model for class 1 (DOS (Denial of Service))...
Using 53874 normal samples and 36741 samples of class 1 for training

Training model for class 2 (Probe (Surveillance/Scanning))...
Using 46625 normal samples and 9325 samples of class 2 for training

Training model for class 3 (R2L (Remote to Local))...
Using 3980 normal samples and 796 samples of class 3 for training

Training model for class 4 (U2R (User to Root))...
Using 1000 normal samples and 42 samples of class 4 for training

All models have been trained!


In [20]:
# Cell 4: Evaluate on training set
print("\nEvaluating on training set...")

# Build the (n_samples, n_classes) score matrix one column per class model.
# score_samples is negated so that a smaller value means the sample looks
# more typical to that model.
train_scores = np.column_stack(
    [-isolation_forests[k].score_samples(X_train) for k in range(n_classes)]
)

# Each sample is assigned to the class whose model gave it the smallest score
train_predictions = train_scores.argmin(axis=1)

# Summary metrics (macro-averaged over classes) plus the detailed reports
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
    y_train,
    train_predictions,
    average='macro',
)
train_report = classification_report(y_train, train_predictions)
train_confusion = confusion_matrix(y_train, train_predictions)

print(
    f"\nTraining set accuracy: {train_accuracy:.4f}",
    f"Macro-average precision: {train_precision:.4f}",
    f"Macro-average recall: {train_recall:.4f}",
    f"Macro-average F1-score: {train_f1:.4f}",
    sep="\n",
)
print("\nClassification Report (Training Set):", train_report, sep="\n")
print("\nConfusion Matrix (Training Set):", train_confusion, sep="\n")


Evaluating on training set...

Training set accuracy: 0.4310
Macro-average precision: 0.5521
Macro-average recall: 0.5789
Macro-average F1-score: 0.3768

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.99      0.02      0.04     53874
           1       0.97      0.91      0.94     36741
           2       0.75      0.91      0.82      9325
           3       0.05      0.90      0.09       796
           4       0.00      0.17      0.00        42

    accuracy                           0.43    100778
   macro avg       0.55      0.58      0.38    100778
weighted avg       0.95      0.43      0.44    100778


Confusion Matrix (Training Set):
[[ 1010   417  2165 14348 35934]
 [    1 33252   555   599  2334]
 [    3   517  8447    16   342]
 [    3     7    46   715    25]
 [    4     3     0    28     7]]


In [21]:
# Cell 5: Evaluate on validation set
print("\nEvaluating on validation set...")

# Build the (n_samples, n_classes) score matrix one column per class model.
# score_samples is negated so that a smaller value means the sample looks
# more typical to that model.
val_scores = np.column_stack(
    [-isolation_forests[k].score_samples(X_val) for k in range(n_classes)]
)

# Each sample is assigned to the class whose model gave it the smallest score
val_predictions = val_scores.argmin(axis=1)

# Summary metrics (macro-averaged over classes) plus the detailed reports
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
    y_val,
    val_predictions,
    average='macro',
)
val_report = classification_report(y_val, val_predictions)
val_confusion = confusion_matrix(y_val, val_predictions)

print(
    f"\nValidation set accuracy: {val_accuracy:.4f}",
    f"Macro-average precision: {val_precision:.4f}",
    f"Macro-average recall: {val_recall:.4f}",
    f"Macro-average F1-score: {val_f1:.4f}",
    sep="\n",
)
print("\nClassification Report (Validation Set):", val_report, sep="\n")
print("\nConfusion Matrix (Validation Set):", val_confusion, sep="\n")


Evaluating on validation set...

Validation set accuracy: 0.4282
Macro-average precision: 0.5525
Macro-average recall: 0.5458
Macro-average F1-score: 0.3762

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.99      0.02      0.04     13469
           1       0.97      0.90      0.93      9186
           2       0.76      0.91      0.82      2331
           3       0.05      0.90      0.09       199
           4       0.00      0.00      0.00        10

    accuracy                           0.43     25195
   macro avg       0.55      0.55      0.38     25195
weighted avg       0.95      0.43      0.44     25195


Confusion Matrix (Validation Set):
[[ 241  107  527 3621 8973]
 [   1 8251  149  153  632]
 [   0  112 2117    3   99]
 [   1    1    9  180    8]
 [   1    0    1    8    0]]


In [22]:
# Cell 6: Evaluate on test set
print("\nEvaluating on test set...")

# Read the held-out test file and separate the label column from the features
df_test = pd.read_csv(processed_test_path)
y_test = df_test['multiclass_label'].values
X_test = df_test.drop(columns='multiclass_label').values

# Per-class sample counts in the test file
test_class_dist = pd.Series(y_test).value_counts().sort_index()
print("\nClass distribution in test data:")
for class_id, count in test_class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y_test)*100:.2f}%)")

# Build the (n_samples, n_classes) score matrix one column per class model.
# score_samples is negated so that a smaller value means the sample looks
# more typical to that model.
test_scores = np.column_stack(
    [-isolation_forests[k].score_samples(X_test) for k in range(n_classes)]
)

# Each sample is assigned to the class whose model gave it the smallest score
test_predictions = test_scores.argmin(axis=1)

# Summary metrics (macro-averaged over classes) plus the detailed reports
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test,
    test_predictions,
    average='macro',
)
test_report = classification_report(y_test, test_predictions)
test_confusion = confusion_matrix(y_test, test_predictions)

print(
    f"\nTest set accuracy: {test_accuracy:.4f}",
    f"Macro-average precision: {test_precision:.4f}",
    f"Macro-average recall: {test_recall:.4f}",
    f"Macro-average F1-score: {test_f1:.4f}",
    sep="\n",
)
print("\nClassification Report (Test Set):", test_report, sep="\n")
print("\nConfusion Matrix (Test Set):", test_confusion, sep="\n")


Evaluating on test set...

Class distribution in test data:
Class 0 (Normal Traffic): 9711 samples (43.08%)
Class 1 (DOS (Denial of Service)): 7458 samples (33.08%)
Class 2 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3 (R2L (Remote to Local)): 2887 samples (12.81%)
Class 4 (U2R (User to Root)): 67 samples (0.30%)

Test set accuracy: 0.3986
Macro-average precision: 0.4855
Macro-average recall: 0.3742
Macro-average F1-score: 0.3064

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       1.00      0.02      0.05      9711
           1       0.63      0.85      0.72      7458
           2       0.66      0.46      0.54      2421
           3       0.13      0.45      0.21      2887
           4       0.01      0.09      0.01        67

    accuracy                           0.40     22544
   macro avg       0.49      0.37      0.31     22544
weighted avg       0.73      0.40      0.34     22544


Confusion Matrix (Test 