In [16]:
# Cell 1: Import Libraries and Setup Paths
import numpy as np
import pandas as pd
import cupy as cp
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle
import time

# Define paths
# Every processed dataset file lives in the same directory; keep that directory in
# one constant so relocating the data is a one-line change instead of five.
DATA_DIR = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi'

processed_train_path = f'{DATA_DIR}/KDDTrain_processed.csv'
processed_test_path = f'{DATA_DIR}/KDDTest_processed.csv'
train_labels_path = f'{DATA_DIR}/KDDTrain_labels.csv'
test_labels_path = f'{DATA_DIR}/KDDTest_labels.csv'

# Load class names mapping
preprocessing_path = f'{DATA_DIR}/preprocessing_objects.pkl'
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only point this at a file produced by a trusted preprocessing run.
with open(preprocessing_path, 'rb') as f:
    preprocessing_objects = pickle.load(f)

# Looked up by integer class id (class_names[class_id]) when later cells print
# per-class information. Extracted after the file handle has been closed.
class_names = preprocessing_objects['class_names']

In [17]:
# Cell 2: Load and Prepare Data
print("Loading training data...")
df_train = pd.read_csv(processed_train_path)

# Separate the target column from the feature matrix
y = df_train['multiclass_label'].values
X = df_train.drop(columns='multiclass_label').values

# Stratified 80/20 hold-out: 80% for fitting, 20% for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Per-class sample counts, computed over the full (unsplit) training file
class_dist = pd.Series(y).value_counts().sort_index()
print("\nClass distribution in training data:")
for class_id, count in class_dist.items():
    share = count / len(y) * 100
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({share:.2f}%)")

# Report the shape of each split
print("\nDataset shapes:")
for split_name, split_features in (("Training set", X_train), ("Validation set", X_val)):
    print(f"{split_name}: {split_features.shape}")
print("Loading complete!")

Loading training data...

Class distribution in training data:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)

Dataset shapes:
Training set: (100778, 43)
Validation set: (25195, 43)
Loading complete!


In [18]:
# Cell 3: Training multiclass One-Class SVM models with improved parameters and balanced samples
#
# What this cell does:
#   1. Oversamples classes 3 (R2L) and 4 (U2R) of the training split with SMOTE.
#   2. Fits one RBF One-Class SVM per class. The class-0 model is fitted on the
#      normal samples only; each attack-class model is fitted on that class's
#      samples stacked with a random subset of normal samples.
# Produces:
#   oc_svms[i]        - fitted OneClassSVM for class i
#   training_times[i] - wall-clock fit time of that model, in seconds
print("\nTraining multiclass One-Class SVM models with improved parameters and balanced samples...")

# The normal-traffic subsets drawn below are random. Use a dedicated, seeded
# generator so that re-running this cell (or Restart & Run All) selects the same
# subsets and therefore fits the same models and reproduces the same metrics.
SUBSAMPLE_SEED = 42
subsample_rng = np.random.default_rng(SUBSAMPLE_SEED)

# Get number of classes
n_classes = len(np.unique(y))
print(f"Number of classes: {n_classes}")

# Adjust nu parameters based on the improved approach from IF model
nu_params = {
    0: 0.35,  # Increased from 0.1 to improve normal traffic detection
    1: 0.15,  # Slightly reduced from 0.2
    2: 0.15,  # Slightly reduced from 0.2
    3: 0.15,  # Slightly reduced from 0.2
    4: 0.10   # Significantly reduced from 0.2 to prevent over-prediction
}

print("Improved nu parameters for each class:")
for i in range(n_classes):
    print(f"Class {i} ({class_names[i]}): {nu_params[i]:.4f}")

# Balance training samples using SMOTE for minority classes only
print("\nBalancing training data using SMOTE for minority classes...")

# Get original class counts (count the boolean mask instead of copying rows)
original_counts = {}
for i in range(n_classes):
    original_counts[i] = int(np.count_nonzero(y_train == i))
    print(f"Original count for Class {i} ({class_names[i]}): {original_counts[i]}")

# Oversampling plan: class id -> factor k. The class is grown k-fold, capped at
# 1/k of the normal-class count. Classes not listed here are left unchanged
# because SMOTE only resamples classes present in the strategy dict.
oversample_factors = {
    3: 5,   # R2L: 5x, capped at 1/5 of normal samples
    4: 10,  # U2R: 10x, capped at 1/10 of normal samples
}

# Define sampling strategy to balance classes
sampling_strategy = {}
for i in range(n_classes):
    if i not in oversample_factors:
        continue
    factor = oversample_factors[i]
    capped_target = min(original_counts[0] // factor, original_counts[i] * factor)
    # SMOTE rejects a target below the current class size, which the cap could
    # produce if a listed class were already larger than normal // factor.
    # Never request fewer samples than the class already has.
    sampling_strategy[i] = max(original_counts[i], capped_target)

print("\nSMOTE sampling strategy for minority classes:")
for class_id, target_count in sampling_strategy.items():
    print(f"Class {class_id} ({class_names[class_id]}): {original_counts[class_id]} → {target_count} samples")

# Apply SMOTE only to minority classes
if sampling_strategy:  # Only apply if we have classes to oversample
    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Display resampled class distribution
    resampled_class_dist = pd.Series(y_resampled).value_counts().sort_index()
    print("\nResampled class distribution:")
    for class_id, count in resampled_class_dist.items():
        print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y_resampled)*100:.2f}%)")
else:
    # If no oversampling needed, use original data
    X_resampled, y_resampled = X_train, y_train
    print("\nNo oversampling applied, using original data.")

# Number of normal samples mixed into each attack-class training set, expressed
# as a multiple of that class's sample count (normal : attack).
normal_to_attack_ratio = {
    1: 0.5,  # DOS (majority attack class): 1:2 normal:DOS
    4: 1.0,  # U2R (most imbalanced class): 1:1
}
default_normal_to_attack_ratio = 2.0  # all other attack classes: 2:1

# Train a One-Class SVM model for each class with improved approach
oc_svms = []
training_times = []

for i in range(n_classes):
    print(f"\nTraining model for class {i} ({class_names[i]})...")

    if i == 0:
        # For normal class, use all normal samples
        current_class_samples = X_resampled[y_resampled == i]
        print(f"Using {len(current_class_samples)} normal samples for training")
    else:
        # NOTE(review): a One-Class SVM models the region occupied by whatever it
        # is fitted on, so stacking normal samples into an attack-class training
        # set presumably makes normal traffic an inlier of that attack model too.
        # This looks consistent with the very low attack recall in the
        # evaluation cells. Confirm whether fitting each attack model on its own
        # class only is what was intended.
        normal_samples = X_resampled[y_resampled == 0]
        attack_samples = X_resampled[y_resampled == i]

        # Never request more normal samples than exist (drawn without replacement)
        ratio = normal_to_attack_ratio.get(i, default_normal_to_attack_ratio)
        n_normal = min(len(normal_samples), int(len(attack_samples) * ratio))

        # Select normal samples randomly, reproducibly via the seeded generator
        normal_indices = subsample_rng.choice(len(normal_samples), n_normal, replace=False)
        selected_normal = normal_samples[normal_indices]

        # Combine samples
        current_class_samples = np.vstack([selected_normal, attack_samples])
        print(f"Using {len(selected_normal)} normal samples and {len(attack_samples)} samples of class {i} for training")
        print(f"Normal to attack ratio: {len(selected_normal)/len(attack_samples):.1f}:1")

    # Train the One-Class SVM model for the current class with improved parameters
    start_time = time.time()
    oc_svm = OneClassSVM(
        kernel='rbf',
        nu=nu_params[i],
        gamma='scale',
        cache_size=1000,  # Increased cache size for better performance
        verbose=False
    )

    # Fit the model; only the fit itself is timed
    oc_svm.fit(current_class_samples)
    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)

    print(f"Training completed in {training_time:.2f} seconds")
    oc_svms.append(oc_svm)

print(f"\nAll models have been trained with improved parameters and balanced samples! Total training time: {sum(training_times):.2f} seconds")


Training multiclass One-Class SVM models with improved parameters and balanced samples...
Number of classes: 5
Improved nu parameters for each class:
Class 0 (Normal Traffic): 0.3500
Class 1 (DOS (Denial of Service)): 0.1500
Class 2 (Probe (Surveillance/Scanning)): 0.1500
Class 3 (R2L (Remote to Local)): 0.1500
Class 4 (U2R (User to Root)): 0.1000

Balancing training data using SMOTE for minority classes...
Original count for Class 0 (Normal Traffic): 53874
Original count for Class 1 (DOS (Denial of Service)): 36741
Original count for Class 2 (Probe (Surveillance/Scanning)): 9325
Original count for Class 3 (R2L (Remote to Local)): 796
Original count for Class 4 (U2R (User to Root)): 42

SMOTE sampling strategy for minority classes:
Class 3 (R2L (Remote to Local)): 796 → 3980 samples
Class 4 (U2R (User to Root)): 42 → 420 samples

Resampled class distribution:
Class 0 (Normal Traffic): 53874 samples (51.63%)
Class 1 (DOS (Denial of Service)): 36741 samples (35.21%)
Class 2 (Probe (Surv

In [19]:
# Cell 4: Evaluate on Training Set
print("\nEvaluating on training set...")

# Score matrix: one row per training sample, one column per class model.
# score_samples is negated, so a LOWER value means the sample fits that class better.
train_scores = np.zeros((X_train.shape[0], n_classes))
for class_idx in range(n_classes):
    class_model = oc_svms[class_idx]
    train_scores[:, class_idx] = np.negative(class_model.score_samples(X_train))

# Each sample is assigned to the class whose model gives the lowest anomaly score
train_predictions = train_scores.argmin(axis=1)

# Calculate evaluation metrics
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
    y_train, train_predictions, average='macro'
)
train_report = classification_report(y_train, train_predictions)
train_confusion = confusion_matrix(y_train, train_predictions)

# Headline metrics, four decimals each
for metric_label, metric_value in (
    ("\nTraining set accuracy", train_accuracy),
    ("Macro-average precision", train_precision),
    ("Macro-average recall", train_recall),
    ("Macro-average F1-score", train_f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

print("\nClassification Report (Training Set):")
print(train_report)
print("\nConfusion Matrix (Training Set):")
print(train_confusion)


Evaluating on training set...

Training set accuracy: 0.5559
Macro-average precision: 0.6493
Macro-average recall: 0.3444
Macro-average F1-score: 0.3292

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.55      0.99      0.70     53874
           1       0.99      0.02      0.04     36741
           2       0.99      0.17      0.29      9325
           3       0.40      0.35      0.37       796
           4       0.32      0.19      0.24        42

    accuracy                           0.56    100778
   macro avg       0.65      0.34      0.33    100778
weighted avg       0.75      0.56      0.42    100778


Confusion Matrix (Training Set):
[[53418     7    14   419    16]
 [36024   717     0     0     0]
 [ 7728     0  1597     0     0]
 [  517     0     0   278     1]
 [   33     0     0     1     8]]


In [20]:
# Cell 5: Evaluate on Validation Set
print("\nEvaluating on validation set...")

# Score matrix: one row per validation sample, one column per class model.
# score_samples is negated, so a LOWER value means the sample fits that class better.
val_scores = np.zeros((X_val.shape[0], n_classes))
for class_idx in range(n_classes):
    class_model = oc_svms[class_idx]
    val_scores[:, class_idx] = np.negative(class_model.score_samples(X_val))

# Each sample is assigned to the class whose model gives the lowest anomaly score
val_predictions = val_scores.argmin(axis=1)

# Calculate evaluation metrics
val_accuracy = accuracy_score(y_val, val_predictions)
val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
    y_val, val_predictions, average='macro'
)
val_report = classification_report(y_val, val_predictions)
val_confusion = confusion_matrix(y_val, val_predictions)

# Headline metrics, four decimals each
for metric_label, metric_value in (
    ("\nValidation set accuracy", val_accuracy),
    ("Macro-average precision", val_precision),
    ("Macro-average recall", val_recall),
    ("Macro-average F1-score", val_f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

print("\nClassification Report (Validation Set):")
print(val_report)
print("\nConfusion Matrix (Validation Set):")
print(val_confusion)


Evaluating on validation set...

Validation set accuracy: 0.5550
Macro-average precision: 0.6331
Macro-average recall: 0.3765
Macro-average F1-score: 0.3402

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.55      0.99      0.70     13469
           1       1.00      0.02      0.04      9186
           2       0.99      0.17      0.29      2331
           3       0.36      0.40      0.38       199
           4       0.27      0.30      0.29        10

    accuracy                           0.55     25195
   macro avg       0.63      0.38      0.34     25195
weighted avg       0.75      0.55      0.42     25195


Confusion Matrix (Validation Set):
[[13314     0     5   142     8]
 [ 9001   185     0     0     0]
 [ 1929     0   401     1     0]
 [  119     0     0    80     0]
 [    7     0     0     0     3]]


In [21]:
# Cell 6: Evaluate on Test Set
print("\nEvaluating on test set...")

# Load test data and separate the target column from the features
df_test = pd.read_csv(processed_test_path)
y_test = df_test['multiclass_label'].values
X_test = df_test.drop(columns='multiclass_label').values

# Display distribution of classes in test data
test_class_dist = pd.Series(y_test).value_counts().sort_index()
print("\nClass distribution in test data:")
for class_id, count in test_class_dist.items():
    share = count / len(y_test) * 100
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({share:.2f}%)")

# Score matrix: one row per test sample, one column per class model.
# score_samples is negated, so a LOWER value means the sample fits that class better.
test_scores = np.zeros((X_test.shape[0], n_classes))
for class_idx in range(n_classes):
    class_model = oc_svms[class_idx]
    test_scores[:, class_idx] = np.negative(class_model.score_samples(X_test))

# Each sample is assigned to the class whose model gives the lowest anomaly score
test_predictions = test_scores.argmin(axis=1)

# Calculate evaluation metrics
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test, test_predictions, average='macro'
)
test_report = classification_report(y_test, test_predictions)
test_confusion = confusion_matrix(y_test, test_predictions)

# Headline metrics, four decimals each
for metric_label, metric_value in (
    ("\nTest set accuracy", test_accuracy),
    ("Macro-average precision", test_precision),
    ("Macro-average recall", test_recall),
    ("Macro-average F1-score", test_f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

print("\nClassification Report (Test Set):")
print(test_report)
print("\nConfusion Matrix (Test Set):")
print(test_confusion)


Evaluating on test set...

Class distribution in test data:
Class 0 (Normal Traffic): 9711 samples (43.08%)
Class 1 (DOS (Denial of Service)): 7458 samples (33.08%)
Class 2 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3 (R2L (Remote to Local)): 2887 samples (12.81%)
Class 4 (U2R (User to Root)): 67 samples (0.30%)

Test set accuracy: 0.4335
Macro-average precision: 0.4252
Macro-average recall: 0.3201
Macro-average F1-score: 0.2372

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.43      0.99      0.60      9711
           1       0.18      0.00      0.00      7458
           2       0.90      0.04      0.08      2421
           3       0.17      0.00      0.00      2887
           4       0.45      0.57      0.50        67

    accuracy                           0.43     22544
   macro avg       0.43      0.32      0.24     22544
weighted avg       0.36      0.43      0.27     22544


Confusion Matrix (Test 