In [31]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import numpy as np
from cuml.model_selection import train_test_split
from xgboost import XGBClassifier
from cuml.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Define paths
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/KDDTest_processed.csv'

In [32]:
# Cell 2: Load and Preprocess Training Data
# Load training data
print("Loading processed training data...")
df_train = cudf.read_csv(processed_train_path)

# Select features and labels
X = df_train.drop(columns=['label', 'binary_label'])
y_binary = df_train['binary_label']

# Display data info
print("Feature matrix shape:", X.shape)
print("Label distribution:")
print(y_binary.value_counts())

Loading processed training data...
Feature matrix shape: (125973, 108)
Label distribution:
0    67343
1    58630
Name: binary_label, dtype: int64


In [33]:
# Cell 3: Split Dataset into Training and Validation Sets
# Split into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42
)

print("Dataset split complete")
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_test.shape[0]}")

Dataset split complete
Training samples: 100779
Validation samples: 25194


In [34]:
# Cell 4: Train Model and Analyze Feature Importance
# Create and train XGBoost model with GPU acceleration
print("\nTraining XGBoost model...")
xgb_binary = XGBClassifier(
    n_estimators=50, 
    random_state=42,
    tree_method='hist',
    device='cuda'
)
xgb_binary.fit(X_train, y_train)

# Display feature importance
feature_importance = cudf.DataFrame({
    'feature': X.columns,
    'importance': xgb_binary.feature_importances_
})
print("\nTop 10 most important features:")
feature_importance = feature_importance.sort_values('importance', ascending=False).head(10)
print(feature_importance.to_pandas())


Training XGBoost model...

Top 10 most important features:
              feature  importance
16      service_ecr_i    0.374037
96          src_bytes    0.195503
5        service_http    0.125345
13      diff_srv_rate    0.056305
2           logged_in    0.032049
19     wrong_fragment    0.031102
21   service_ftp_data    0.020335
6     service_private    0.015249
101         dst_bytes    0.014588
71     is_guest_login    0.012392


In [35]:
# Cell 5: Evaluate Model on Validation Set
# Predict and evaluate on validation set
print("\nEvaluating on validation set...")
y_pred = xgb_binary.predict(X_test)

# Convert to numpy arrays for sklearn metrics
y_test_np = y_test.to_numpy()
y_pred_np = y_pred if isinstance(y_pred, np.ndarray) else y_pred.to_numpy()

# Calculate metrics
accuracy = accuracy_score(y_test_np, y_pred_np)
conf_matrix = confusion_matrix(y_test_np, y_pred_np)
class_report = classification_report(y_test_np, y_pred_np)

print(f"Validation Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Evaluating on validation set...
Validation Accuracy: 1.00

Confusion Matrix:
[[13482    17]
 [   15 11680]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13499
           1       1.00      1.00      1.00     11695

    accuracy                           1.00     25194
   macro avg       1.00      1.00      1.00     25194
weighted avg       1.00      1.00      1.00     25194



In [36]:
# Cell 6: Evaluate Model on External Test Set
# Load and evaluate on external test set
print("\nEvaluating on external test set...")
df_test = cudf.read_csv(processed_test_path)

X_external_test = df_test.drop(columns=['label', 'binary_label'])
y_external_test = df_test['binary_label']

# Make predictions
y_external_pred = xgb_binary.predict(X_external_test)

# Convert to numpy arrays for sklearn metrics
y_external_test_np = y_external_test.to_numpy()
y_external_pred_np = y_external_pred if isinstance(y_external_pred, np.ndarray) else y_external_pred.to_numpy()

# Calculate metrics
accuracy_external = accuracy_score(y_external_test_np, y_external_pred_np)
conf_matrix_external = confusion_matrix(y_external_test_np, y_external_pred_np)
class_report_external = classification_report(y_external_test_np, y_external_pred_np)

print(f"External Test Set Accuracy: {accuracy_external:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix_external)
print("\nClassification Report:")
print(class_report_external)


Evaluating on external test set...
External Test Set Accuracy: 0.88

Confusion Matrix:
[[ 7257  2454]
 [  144 12689]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.75      0.85      9711
           1       0.84      0.99      0.91     12833

    accuracy                           0.88     22544
   macro avg       0.91      0.87      0.88     22544
weighted avg       0.90      0.88      0.88     22544

