In [6]:
# Cell 1: Import Libraries and Setup Paths
import numpy as np
import pandas as pd
import cupy as cp
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import pickle
import time

# Define paths
processed_train_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTest_processed.csv'
train_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTrain_labels.csv'
test_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTest_labels.csv'

# Load class names mapping
preprocessing_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/preprocessing_objects.pkl'
with open(preprocessing_path, 'rb') as f:
    preprocessing_objects = pickle.load(f)
    class_names = preprocessing_objects['class_names']


In [7]:
# Cell 2: Load and Prepare Data
print("Loading training data...")
df_train = pd.read_csv(processed_train_path)
X = df_train.drop('multiclass_label', axis=1).values
y = df_train['multiclass_label'].values

# Split training set and validation set (80-20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display number of samples for each class
class_dist = pd.Series(y).value_counts().sort_index()
print("\nClass distribution in training data:")
for class_id, count in class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y)*100:.2f}%)")

# Display data information
print("\nDataset shapes:")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print("Loading complete!")

Loading training data...

Class distribution in training data:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)

Dataset shapes:
Training set: (100778, 43)
Validation set: (25195, 43)
Loading complete!


In [8]:
# Cell 3: Training multiclass One-Class SVM models
print("\nTraining multiclass One-Class SVM models...")

# Get number of classes
n_classes = len(np.unique(y))
print(f"Number of classes: {n_classes}")

# Adjust nu parameters based on the proportion of each class in the data
# nu is an upper bound on the fraction of training errors and a lower bound on the fraction of support vectors
class_proportions = pd.Series(y_train).value_counts(normalize=True).sort_index()
nu_params = {}
for i in range(n_classes):
    if i == 0:
        nu_params[i] = 0.1  # Normal traffic class
    else:
        nu_params[i] = min(0.2, 1 - class_proportions[i])
        
print("Nu parameters for each class:")
for i in range(n_classes):
    print(f"Class {i} ({class_names[i]}): {nu_params[i]:.4f}")

# Train a One-Class SVM model for each class
oc_svms = []
training_times = []

for i in range(n_classes):
    print(f"\nTraining model for class {i} ({class_names[i]})...")
    
    if i == 0:
        # For normal traffic, train only on normal samples
        current_class_samples = X_train[y_train == i]
        print(f"Using {len(current_class_samples)} normal samples for training")
    else:
        # For anomaly classes, train on normal samples and samples of the current class
        normal_samples = X_train[y_train == 0]
        abnormal_samples = X_train[y_train == i]
        
        # Keep a reasonable ratio between normal and abnormal samples
        n_normal = min(len(normal_samples), max(len(abnormal_samples) * 5, 1000))
        
        # Select normal samples randomly
        normal_indices = np.random.choice(len(normal_samples), n_normal, replace=False)
        selected_normal = normal_samples[normal_indices]
        
        # Combine samples
        current_class_samples = np.vstack([selected_normal, abnormal_samples])
        print(f"Using {len(selected_normal)} normal samples and {len(abnormal_samples)} samples of class {i} for training")
    
    # Train the One-Class SVM model for the current class
    start_time = time.time()
    oc_svm = OneClassSVM(
        kernel='rbf',
        nu=nu_params[i],
        gamma='scale',
        cache_size=500,
        verbose=False
    )
    
    # Fit the model
    oc_svm.fit(current_class_samples)
    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)
    
    print(f"Training completed in {training_time:.2f} seconds")
    oc_svms.append(oc_svm)

print(f"\nAll models have been trained! Total training time: {sum(training_times):.2f} seconds")


Training multiclass One-Class SVM models...
Number of classes: 5
Nu parameters for each class:
Class 0 (Normal Traffic): 0.1000
Class 1 (DOS (Denial of Service)): 0.2000
Class 2 (Probe (Surveillance/Scanning)): 0.2000
Class 3 (R2L (Remote to Local)): 0.2000
Class 4 (U2R (User to Root)): 0.2000

Training model for class 0 (Normal Traffic)...
Using 53874 normal samples for training
Training completed in 61.21 seconds

Training model for class 1 (DOS (Denial of Service))...
Using 53874 normal samples and 36741 samples of class 1 for training
Training completed in 581.59 seconds

Training model for class 2 (Probe (Surveillance/Scanning))...
Using 46625 normal samples and 9325 samples of class 2 for training
Training completed in 143.18 seconds

Training model for class 3 (R2L (Remote to Local))...
Using 3980 normal samples and 796 samples of class 3 for training
Training completed in 0.56 seconds

Training model for class 4 (U2R (User to Root))...
Using 1000 normal samples and 42 samples 

In [9]:
# Cell 4: Evaluate on Training and Validation Sets
print("\nEvaluating on training set...")

# Calculate anomaly scores for each class
train_scores = np.zeros((X_train.shape[0], n_classes))

for i in range(n_classes):
    # Calculate anomaly scores for the i-th class
    train_scores[:, i] = -oc_svms[i].score_samples(X_train)

# Predict the class with the lowest anomaly score
train_predictions = np.argmin(train_scores, axis=1)

# Calculate evaluation metrics
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(y_train, train_predictions, average='macro')
train_report = classification_report(y_train, train_predictions)
train_confusion = confusion_matrix(y_train, train_predictions)

print(f"\nTraining set accuracy: {train_accuracy:.4f}")
print(f"Macro-average precision: {train_precision:.4f}")
print(f"Macro-average recall: {train_recall:.4f}")
print(f"Macro-average F1-score: {train_f1:.4f}")
print("\nClassification Report (Training Set):")
print(train_report)
print("\nConfusion Matrix (Training Set):")
print(train_confusion)

# Evaluate on validation set
print("\nEvaluating on validation set...")

# Calculate anomaly scores for each class
val_scores = np.zeros((X_val.shape[0], n_classes))

for i in range(n_classes):
    # Calculate anomaly scores for the i-th class
    val_scores[:, i] = -oc_svms[i].score_samples(X_val)

# Predict the class with the lowest anomaly score
val_predictions = np.argmin(val_scores, axis=1)

# Calculate evaluation metrics
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_val, val_predictions, average='macro')
val_report = classification_report(y_val, val_predictions)
val_confusion = confusion_matrix(y_val, val_predictions)

print(f"\nValidation set accuracy: {val_accuracy:.4f}")
print(f"Macro-average precision: {val_precision:.4f}")
print(f"Macro-average recall: {val_recall:.4f}")
print(f"Macro-average F1-score: {val_f1:.4f}")
print("\nClassification Report (Validation Set):")
print(val_report)
print("\nConfusion Matrix (Validation Set):")
print(val_confusion)


Evaluating on training set...

Training set accuracy: 0.4244
Macro-average precision: 0.7530
Macro-average recall: 0.3560
Macro-average F1-score: 0.3164

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.60      0.01      0.01     53874
           1       0.39      1.00      0.56     36741
           2       0.83      0.61      0.70      9325
           3       0.95      0.07      0.13       796
           4       1.00      0.10      0.17        42

    accuracy                           0.42    100778
   macro avg       0.75      0.36      0.32    100778
weighted avg       0.55      0.42      0.28    100778


Confusion Matrix (Training Set):
[[  319 52458  1094     3     0]
 [    0 36697    44     0     0]
 [    0  3628  5697     0     0]
 [  202   501    38    55     0]
 [   15    18     5     0     4]]

Evaluating on validation set...

Validation set accuracy: 0.4243
Macro-average precision: 0.6629
Macro-average recall

In [10]:
# Cell 5: Evaluate on Test Set
print("\nEvaluating on test set...")

# Load test data
df_test = pd.read_csv(processed_test_path)
X_test = df_test.drop('multiclass_label', axis=1).values
y_test = df_test['multiclass_label'].values

# Display distribution of classes in test data
test_class_dist = pd.Series(y_test).value_counts().sort_index()
print("\nClass distribution in test data:")
for class_id, count in test_class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y_test)*100:.2f}%)")

# Calculate anomaly scores for each class
test_scores = np.zeros((X_test.shape[0], n_classes))

for i in range(n_classes):
    # Calculate anomaly scores for the i-th class
    test_scores[:, i] = -oc_svms[i].score_samples(X_test)

# Predict the class with the lowest anomaly score
test_predictions = np.argmin(test_scores, axis=1)

# Calculate evaluation metrics
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_test, test_predictions, average='macro')
test_report = classification_report(y_test, test_predictions)
test_confusion = confusion_matrix(y_test, test_predictions)

print(f"\nTest set accuracy: {test_accuracy:.4f}")
print(f"Macro-average precision: {test_precision:.4f}")
print(f"Macro-average recall: {test_recall:.4f}")
print(f"Macro-average F1-score: {test_f1:.4f}")
print("\nClassification Report (Test Set):")
print(test_report)
print("\nConfusion Matrix (Test Set):")
print(test_confusion)


Evaluating on test set...

Class distribution in test data:
Class 0 (Normal Traffic): 9711 samples (43.08%)
Class 1 (DOS (Denial of Service)): 7458 samples (33.08%)
Class 2 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3 (R2L (Remote to Local)): 2887 samples (12.81%)
Class 4 (U2R (User to Root)): 67 samples (0.30%)

Test set accuracy: 0.3660
Macro-average precision: 0.6031
Macro-average recall: 0.2927
Macro-average F1-score: 0.2416

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.71      0.00      0.00      9711
           1       0.35      1.00      0.52      7458
           2       0.70      0.33      0.45      2421
           3       0.26      0.00      0.01      2887
           4       1.00      0.13      0.24        67

    accuracy                           0.37     22544
   macro avg       0.60      0.29      0.24     22544
weighted avg       0.53      0.37      0.22     22544


Confusion Matrix (Test 