In [1]:
# Cell 1: Import Libraries and Setup Paths
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
import pickle
import time
import os

# Define paths for binary classification (OneClassSVM)
bin_processed_train_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTrain_processed.csv'
bin_processed_test_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTest_processed.csv'
bin_train_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTrain_labels.csv'
bin_test_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/bin/KDDTest_labels.csv'

# Define paths for multi-class classification (XGBoost)
multi_processed_train_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTrain_processed.csv'
multi_processed_test_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTest_processed.csv'
multi_train_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTrain_labels.csv'
multi_test_labels_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/KDDTest_labels.csv'

# Load class names mapping
preprocessing_path = '/root/autodl-tmp/projects/USL_NSL/dataset/processed/multi/preprocessing_objects.pkl'
with open(preprocessing_path, 'rb') as f:
    preprocessing_objects = pickle.load(f)
    class_names = preprocessing_objects['class_names']

In [2]:
# Cell 2: Load and Prepare Binary Data for OneClassSVM
print("Loading binary classification data for OneClassSVM...")
# Load binary features and labels
df_bin_train = pd.read_csv(bin_processed_train_path)
X_bin = df_bin_train.values

y_bin_train = pd.read_csv(bin_train_labels_path)
y_bin_train_binary = y_bin_train['label'].values

# Feature scaling for binary data
bin_scaler = StandardScaler()
X_bin_scaled = bin_scaler.fit_transform(X_bin)

# Handle class imbalance using SMOTE for binary classification
print("Applying SMOTE to handle class imbalance for binary data...")
bin_smote = SMOTE(random_state=42)
X_bin_resampled, y_bin_resampled = bin_smote.fit_resample(X_bin_scaled, y_bin_train_binary)

# Split binary data into training and validation sets
X_bin_train, X_bin_val, y_bin_train_split, y_bin_val = train_test_split(
    X_bin_resampled, y_bin_resampled, test_size=0.2, random_state=42
)

print(f"Binary training set shape: {X_bin_train.shape}")
print(f"Binary validation set shape: {X_bin_val.shape}")
print("Binary data loading complete!")

Loading binary classification data for OneClassSVM...
Applying SMOTE to handle class imbalance for binary data...
Binary training set shape: (107748, 43)
Binary validation set shape: (26938, 43)
Binary data loading complete!


In [3]:
# Cell 3: Load and Prepare Multi-class Data for XGBoost
print("\nLoading multi-class data for XGBoost...")
# Load multi-class features and labels
df_multi_train = pd.read_csv(multi_processed_train_path)
X_multi = df_multi_train.drop('multiclass_label', axis=1).values
y_multi = df_multi_train['multiclass_label'].values

# Create binary labels (0: normal, 1: attack) for reference
y_multi_binary = np.where(y_multi == 0, 0, 1)

# Split multi-class data into training and validation sets
X_multi_train, X_multi_val, y_multi_train, y_multi_val = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)
_, _, y_multi_binary_train, y_multi_binary_val = train_test_split(
    X_multi, y_multi_binary, test_size=0.2, random_state=42, stratify=y_multi_binary
)

# Feature scaling for multi-class data
multi_scaler = StandardScaler()
X_multi_train_scaled = multi_scaler.fit_transform(X_multi_train)
X_multi_val_scaled = multi_scaler.transform(X_multi_val)

# Display number of samples for each class
class_dist = pd.Series(y_multi).value_counts().sort_index()
print("\nClass distribution in multi-class training data:")
for class_id, count in class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y_multi)*100:.2f}%)")

print(f"\nMulti-class training set shape: {X_multi_train.shape}")
print(f"Multi-class validation set shape: {X_multi_val.shape}")
print("Multi-class data loading complete!")


Loading multi-class data for XGBoost...

Class distribution in multi-class training data:
Class 0 (Normal Traffic): 67343 samples (53.46%)
Class 1 (DOS (Denial of Service)): 45927 samples (36.46%)
Class 2 (Probe (Surveillance/Scanning)): 11656 samples (9.25%)
Class 3 (R2L (Remote to Local)): 995 samples (0.79%)
Class 4 (U2R (User to Root)): 52 samples (0.04%)

Multi-class training set shape: (100778, 43)
Multi-class validation set shape: (25195, 43)
Multi-class data loading complete!


In [4]:
# Cell 4: Train OneClassSVM for Anomaly Detection (Using Binary Data)
print("\nTraining OneClassSVM for anomaly detection using binary data...")

# Train OneClassSVM with optimized parameters
start_time = time.time()
anomaly_detector = OneClassSVM(
    kernel='rbf',
    nu=0.35,           # Parameter from the successful binary model
    gamma='scale',     # Kernel coefficient
    cache_size=1000,   # Increased cache size
    max_iter=1000,     # Increased max iterations
    tol=1e-4           # Set a smaller tolerance for convergence
)

# Train on the binary training data
anomaly_detector.fit(X_bin_train)
anomaly_train_time = time.time() - start_time
print(f"OneClassSVM training completed in {anomaly_train_time:.2f} seconds")

# Function to convert OneClassSVM predictions to binary labels
def convert_predictions(predictions):
    return np.where(predictions == 1, 0, 1)

# Evaluate anomaly detector on binary validation set
anomaly_val_pred_raw = anomaly_detector.predict(X_bin_val)
anomaly_val_pred = convert_predictions(anomaly_val_pred_raw)

# Calculate evaluation metrics
anomaly_val_accuracy = accuracy_score(y_bin_val, anomaly_val_pred)
anomaly_val_precision, anomaly_val_recall, anomaly_val_f1, _ = precision_recall_fscore_support(
    y_bin_val, anomaly_val_pred, average='binary'
)

# Calculate unsupervised evaluation metrics
try:
    silhouette_avg = silhouette_score(X_bin_val, anomaly_val_pred)
    davies_bouldin_idx = davies_bouldin_score(X_bin_val, anomaly_val_pred)
    print("\nUnsupervised Evaluation Metrics:")
    print(f"Silhouette Score: {silhouette_avg:.3f}")
    print(f"Davies-Bouldin Index: {davies_bouldin_idx:.3f}")
except:
    print("Could not calculate unsupervised metrics due to single class prediction")

# Print prediction distribution
print("\nPrediction distribution (Binary Validation Set):")
unique, counts = np.unique(anomaly_val_pred, return_counts=True)
print(dict(zip(unique, counts)))

print(f"\nAnomaly detector - Binary validation set performance:")
print(f"Accuracy: {anomaly_val_accuracy:.4f}")
print(f"Precision: {anomaly_val_precision:.4f}")
print(f"Recall: {anomaly_val_recall:.4f}")
print(f"F1-score: {anomaly_val_f1:.4f}")

# Display confusion matrix for binary classification
anomaly_val_cm = confusion_matrix(y_bin_val, anomaly_val_pred)
print("\nConfusion Matrix (Binary Validation Set):")
print(f"True Negatives: {anomaly_val_cm[0, 0]} | False Positives: {anomaly_val_cm[0, 1]}")
print(f"False Negatives: {anomaly_val_cm[1, 0]} | True Positives: {anomaly_val_cm[1, 1]}")


Training OneClassSVM for anomaly detection using binary data...




OneClassSVM training completed in 272.16 seconds

Unsupervised Evaluation Metrics:
Silhouette Score: 0.241
Davies-Bouldin Index: 2.728

Prediction distribution (Binary Validation Set):
{0: 16823, 1: 10115}

Anomaly detector - Binary validation set performance:
Accuracy: 0.5284
Precision: 0.5418
Recall: 0.4045
F1-score: 0.4632

Confusion Matrix (Binary Validation Set):
True Negatives: 8754 | False Positives: 4635
False Negatives: 8069 | True Positives: 5480


In [5]:
# Cell 5: Train XGBoost for Attack Type Classification (Using Multi-class Data)
print("\nTraining XGBoost for attack type classification using multi-class data...")

# Extract attack samples for training
attack_indices_train = np.where(y_multi_train != 0)[0]
X_attack_train = X_multi_train[attack_indices_train]
y_attack_train = y_multi_train[attack_indices_train]
# Shift labels to start from 0 (1->0, 2->1, 3->2, 4->3)
y_attack_train_shifted = y_attack_train - 1

# Feature selection for attack classification
print("Performing feature selection...")
selector = SelectKBest(f_classif, k=40)
X_attack_train_selected = selector.fit_transform(X_attack_train, y_attack_train_shifted)

# Define adaptive sampling function for attack types
def adaptive_sampling(X, y, verbose=True):
    # Calculate class distributions
    class_counts = np.bincount(y)
    if verbose:
        print("Original attack class distribution:")
        print(class_counts)
    
    # Define sampling strategy for attack types
    sampling_strategy = {
        0: int(class_counts[0] * 1.0),  # DOS
        1: int(class_counts[1] * 1.2),  # Probe
        2: int(class_counts[2] * 2.0),  # R2L
        3: int(class_counts[3] * 3.0)   # U2R
    }
    
    # Apply SMOTE with adaptive strategy
    smote = SMOTE(
        sampling_strategy=sampling_strategy,
        random_state=42,
        k_neighbors=min(5, min(class_counts[class_counts > 0]) - 1)
    )
    
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    if verbose:
        print("\nAttack class distribution after sampling:")
        print(np.bincount(y_resampled))
    
    return X_resampled, y_resampled

# Apply adaptive sampling to handle class imbalance
print("Applying adaptive sampling to handle class imbalance...")
X_attack_train_balanced, y_attack_train_balanced = adaptive_sampling(
    X_attack_train_selected, y_attack_train_shifted
)

# Train XGBoost for attack type classification
start_time = time.time()

# Create base XGBoost model
base_model = XGBClassifier(
    n_estimators=150,
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=5,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.5,
    reg_lambda=2,
    objective='multi:softmax',
    num_class=4,  # 4 attack types (excluding normal)
    random_state=42,
    tree_method='hist'
)

# Use Bagging to improve model performance
attack_classifier = BaggingClassifier(
    estimator=base_model,
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)

attack_classifier.fit(X_attack_train_balanced, y_attack_train_balanced)
attack_train_time = time.time() - start_time
print(f"Attack classifier training completed in {attack_train_time:.2f} seconds")

# Evaluate attack classifier on attack samples from training set
y_attack_train_pred = attack_classifier.predict(X_attack_train_selected)
attack_train_accuracy = accuracy_score(y_attack_train_shifted, y_attack_train_pred)
attack_train_precision, attack_train_recall, attack_train_f1, _ = precision_recall_fscore_support(
    y_attack_train_shifted, y_attack_train_pred, average='macro'
)

print(f"\nAttack classifier - Training set performance (attack samples only):")
print(f"Accuracy: {attack_train_accuracy:.4f}")
print(f"Macro-average precision: {attack_train_precision:.4f}")
print(f"Macro-average recall: {attack_train_recall:.4f}")
print(f"Macro-average F1-score: {attack_train_f1:.4f}")

# Display classification report for attack types
attack_train_report = classification_report(y_attack_train_shifted, y_attack_train_pred)
print("\nClassification Report (Training Set) - Attack Types:")
print(attack_train_report)


Training XGBoost for attack type classification using multi-class data...
Performing feature selection...
Applying adaptive sampling to handle class imbalance...
Original attack class distribution:
[36741  9325   796    42]

Attack class distribution after sampling:
[36741 11190  1592   126]
Attack classifier training completed in 1445.44 seconds

Attack classifier - Training set performance (attack samples only):
Accuracy: 0.9991
Macro-average precision: 0.9595
Macro-average recall: 0.9170
Macro-average F1-score: 0.9361

Classification Report (Training Set) - Attack Types:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     36741
           1       1.00      1.00      1.00      9325
           2       0.99      0.98      0.98       796
           3       0.85      0.69      0.76        42

    accuracy                           1.00     46904
   macro avg       0.96      0.92      0.94     46904
weighted avg       1.00      1.00     

In [6]:
# Cell 6: Evaluate Hybrid Model on Multi-class Validation Set
print("\nEvaluating hybrid model on multi-class validation set...")

# Step 1: Use OneClassSVM to detect anomalies
# First, we need to scale the multi-class validation data using the binary scaler
X_multi_val_bin_scaled = bin_scaler.transform(X_multi_val)
anomaly_multi_val_pred_raw = anomaly_detector.predict(X_multi_val_bin_scaled)
anomaly_multi_val_pred = convert_predictions(anomaly_multi_val_pred_raw)

# Step 2: For samples classified as anomalies, predict the attack type
anomaly_indices_val = np.where(anomaly_multi_val_pred == 1)[0]
X_anomaly_multi_val = X_multi_val[anomaly_indices_val]

if len(anomaly_indices_val) > 0:
    # Apply feature selection
    X_anomaly_multi_val_selected = selector.transform(X_anomaly_multi_val)
    
    # Predict attack types
    y_attack_multi_val_pred = attack_classifier.predict(X_anomaly_multi_val_selected)
    # Shift back to original labels (0->1, 1->2, 2->3, 3->4)
    y_attack_multi_val_pred_shifted = y_attack_multi_val_pred + 1
    
    # Create final predictions
    y_multi_val_pred = np.zeros_like(y_multi_val)
    y_multi_val_pred[anomaly_indices_val] = y_attack_multi_val_pred_shifted
    
    # Calculate evaluation metrics
    val_accuracy = accuracy_score(y_multi_val, y_multi_val_pred)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_multi_val, y_multi_val_pred, average='macro')
    val_report = classification_report(y_multi_val, y_multi_val_pred)
    val_confusion = confusion_matrix(y_multi_val, y_multi_val_pred)
    
    print(f"\nHybrid model - Multi-class validation set performance:")
    print(f"Accuracy: {val_accuracy:.4f}")
    print(f"Macro-average precision: {val_precision:.4f}")
    print(f"Macro-average recall: {val_recall:.4f}")
    print(f"Macro-average F1-score: {val_f1:.4f}")
    print("\nClassification Report (Multi-class Validation Set):")
    print(val_report)
    print("\nConfusion Matrix (Multi-class Validation Set):")
    print(val_confusion)
else:
    print("No anomalies detected in validation set by OneClassSVM.")


Evaluating hybrid model on multi-class validation set...

Hybrid model - Multi-class validation set performance:
Accuracy: 0.4658
Macro-average precision: 0.4296
Macro-average recall: 0.7350
Macro-average F1-score: 0.3501

Classification Report (Multi-class Validation Set):
              precision    recall  f1-score   support

           0       0.87      0.00      0.00     13469
           1       0.51      1.00      0.68      9186
           2       0.56      1.00      0.72      2331
           3       0.06      0.97      0.12       199
           4       0.14      0.70      0.23        10

    accuracy                           0.47     25195
   macro avg       0.43      0.73      0.35     25195
weighted avg       0.71      0.47      0.32     25195


Confusion Matrix (Multi-class Validation Set):
[[  27 8729 1791 2880   42]
 [   4 9179    3    0    0]
 [   0    1 2328    2    0]
 [   0    0    3  194    2]
 [   0    2    1    0    7]]


In [7]:
# Cell 7: Evaluate Hybrid Model on Multi-class Test Set
print("\nEvaluating hybrid model on multi-class test set...")

# Load multi-class test data
df_multi_test = pd.read_csv(multi_processed_test_path)
X_multi_test = df_multi_test.drop('multiclass_label', axis=1).values
y_multi_test = df_multi_test['multiclass_label'].values

# Create binary labels from multi-class labels for binary evaluation
y_multi_binary_test = np.where(y_multi_test == 0, 0, 1)

# Display distribution of classes in test data
test_class_dist = pd.Series(y_multi_test).value_counts().sort_index()
print("\nClass distribution in multi-class test data:")
for class_id, count in test_class_dist.items():
    print(f"Class {class_id} ({class_names[class_id]}): {count} samples ({count/len(y_multi_test)*100:.2f}%)")

# Scale multi-class test data using binary scaler for anomaly detection
X_multi_test_bin_scaled = bin_scaler.transform(X_multi_test)

# Step 1: Use OneClassSVM to detect anomalies
anomaly_multi_test_pred_raw = anomaly_detector.predict(X_multi_test_bin_scaled)
anomaly_multi_test_pred = convert_predictions(anomaly_multi_test_pred_raw)

# Calculate binary classification metrics
anomaly_test_accuracy = accuracy_score(y_multi_binary_test, anomaly_multi_test_pred)
anomaly_test_precision, anomaly_test_recall, anomaly_test_f1, _ = precision_recall_fscore_support(
    y_multi_binary_test, anomaly_multi_test_pred, average='binary'
)

print(f"\nAnomaly detector - Test set performance (binary):")
print(f"Accuracy: {anomaly_test_accuracy:.4f}")
print(f"Precision: {anomaly_test_precision:.4f}")
print(f"Recall: {anomaly_test_recall:.4f}")
print(f"F1-score: {anomaly_test_f1:.4f}")

# Display confusion matrix for binary classification
anomaly_test_cm = confusion_matrix(y_multi_binary_test, anomaly_multi_test_pred)
print("\nConfusion Matrix (Test Set) - Binary Classification:")
print(f"True Negatives: {anomaly_test_cm[0, 0]} | False Positives: {anomaly_test_cm[0, 1]}")
print(f"False Negatives: {anomaly_test_cm[1, 0]} | True Positives: {anomaly_test_cm[1, 1]}")

# Step 2: For samples classified as anomalies, predict the attack type
anomaly_indices_test = np.where(anomaly_multi_test_pred == 1)[0]
X_anomaly_multi_test = X_multi_test[anomaly_indices_test]

if len(anomaly_indices_test) > 0:
    # Apply feature selection
    X_anomaly_multi_test_selected = selector.transform(X_anomaly_multi_test)
    
    # Predict attack types
    y_attack_multi_test_pred = attack_classifier.predict(X_anomaly_multi_test_selected)
    # Shift back to original labels
    y_attack_multi_test_pred_shifted = y_attack_multi_test_pred + 1
    
    # Create final predictions
    y_multi_test_pred = np.zeros_like(y_multi_test)
    y_multi_test_pred[anomaly_indices_test] = y_attack_multi_test_pred_shifted
    
    # Calculate evaluation metrics
    test_accuracy = accuracy_score(y_multi_test, y_multi_test_pred)
    test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_multi_test, y_multi_test_pred, average='macro')
    test_report = classification_report(y_multi_test, y_multi_test_pred)
    test_confusion = confusion_matrix(y_multi_test, y_multi_test_pred)
    
    print(f"\nHybrid model - Multi-class test set performance:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"Macro-average precision: {test_precision:.4f}")
    print(f"Macro-average recall: {test_recall:.4f}")
    print(f"Macro-average F1-score: {test_f1:.4f}")
    print("\nClassification Report (Multi-class Test Set):")
    print(test_report)
    print("\nConfusion Matrix (Multi-class Test Set):")
    print(test_confusion)
    
    # Calculate per-class metrics
    print("\nPer-class performance:")
    for i in range(5):  # 0-4 classes
        class_indices = np.where(y_multi_test == i)[0]
        if len(class_indices) > 0:
            class_acc = accuracy_score(y_multi_test[class_indices], y_multi_test_pred[class_indices])
            print(f"Class {i} ({class_names[i]}) accuracy: {class_acc:.4f}")
else:
    print("No anomalies detected in test set by OneClassSVM.")


Evaluating hybrid model on multi-class test set...

Class distribution in multi-class test data:
Class 0 (Normal Traffic): 9711 samples (43.08%)
Class 1 (DOS (Denial of Service)): 7458 samples (33.08%)
Class 2 (Probe (Surveillance/Scanning)): 2421 samples (10.74%)
Class 3 (R2L (Remote to Local)): 2887 samples (12.81%)
Class 4 (U2R (User to Root)): 67 samples (0.30%)

Anomaly detector - Test set performance (binary):
Accuracy: 0.5706
Precision: 0.5700
Recall: 0.9998
F1-score: 0.7261

Confusion Matrix (Test Set) - Binary Classification:
True Negatives: 33 | False Positives: 9678
False Negatives: 2 | True Positives: 12831

Hybrid model - Multi-class test set performance:
Accuracy: 0.3785
Macro-average precision: 0.6539
Macro-average recall: 0.3992
Macro-average F1-score: 0.3173

Classification Report (Multi-class Test Set):
              precision    recall  f1-score   support

           0       0.94      0.00      0.01      9711
           1       0.39      0.85      0.54      7458
   

In [8]:
# Cell 8: Save Trained Models
print("\nSaving models...")
# Create Paths to Save Models
model_dir = '/root/autodl-tmp/projects/USL_NSL/notebooks/multi/hybrid/models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"Created directory: {model_dir}")

models = {
    'bin_scaler': bin_scaler,
    'multi_scaler': multi_scaler,
    'anomaly_detector': anomaly_detector,
    'attack_classifier': attack_classifier,
    'feature_selector': selector,
    'class_names': class_names
}

model_path = '/root/autodl-tmp/projects/USL_NSL/notebooks/multi/hybrid/models/improved_hybrid_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(models, f)
print(f"Models saved to {model_path}")


Saving models...
Models saved to /root/autodl-tmp/projects/USL_NSL/notebooks/multi/hybrid/models/improved_hybrid_model.pkl
