In [18]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import numpy as np
from cuml.model_selection import train_test_split
from xgboost import XGBClassifier
from cuml.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Define paths
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTest_processed.csv'

In [19]:
# Cell 2: Load and Preprocess Training Data
print("Loading processed training data...")
df_train = cudf.read_csv(processed_train_path)

# Handle missing values in training data
print("\nChecking for missing values in training data:")
print(df_train.isnull().sum().to_pandas())

# Fill missing values if any exist
df_train = df_train.fillna(df_train.mean())

# Select features and labels
X = df_train.drop(columns=['multiclass_label']).astype('float32')
y_multi = df_train['multiclass_label']

# Display data info
print("\nFeature matrix shape:", X.shape)
print("Label distribution:")
print(y_multi.value_counts().to_pandas())

Loading processed training data...

Checking for missing values in training data:
dst_host_srv_count    0
logged_in             0
flag_SF               0
service_http          0
service_private       0
                     ..
urgent                0
land                  0
is_host_login         0
service_tim_i         0
multiclass_label      0
Length: 110, dtype: int64

Feature matrix shape: (125973, 109)
Label distribution:
0    67343
1    45927
2    11656
3      995
4       52
Name: multiclass_label, dtype: int64


In [20]:
# Cell 3: Split Dataset into Training and Validation Sets
print("\nSplitting dataset into training and validation sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_multi, test_size=0.2, random_state=42
)

print("Dataset split complete")
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_test.shape[0]}")


Splitting dataset into training and validation sets...
Dataset split complete
Training samples: 100779
Validation samples: 25194


In [21]:
# Cell 4: Train XGBoost Model and Analyze Feature Importance
print("\nTraining XGBoost model...")
xgb_multi = XGBClassifier(
    n_estimators=100, 
    random_state=42,
    tree_method='hist',
    device='cuda',
    objective='multi:softmax',  # Use softmax for multi-class classification
    num_class=5  # Number of classes in the target variable
)
xgb_multi.fit(X_train, y_train)

# Display feature importance
feature_importance = cudf.DataFrame({
    'feature': X.columns,
    'importance': xgb_multi.feature_importances_
})
print("\nTop 10 most important features:")
feature_importance = feature_importance.sort_values('importance', ascending=False).head(10)
print(feature_importance.to_pandas())


Training XGBoost model...

Top 10 most important features:
                     feature  importance
8              service_eco_i    0.271069
21             service_ecr_i    0.203579
12             diff_srv_rate    0.111916
3               service_http    0.078523
7   dst_host_srv_serror_rate    0.072739
90                 src_bytes    0.060145
28            wrong_fragment    0.054443
6                      count    0.020290
5     dst_host_diff_srv_rate    0.015705
22                       hot    0.014073


In [22]:
# Cell 5: Evaluate Model on Validation Set
print("\nEvaluating on validation set...")
y_pred = xgb_multi.predict(X_test)

# Convert to numpy arrays for sklearn metrics
y_test_np = y_test.to_numpy()
y_pred_np = y_pred if isinstance(y_pred, np.ndarray) else y_pred.to_numpy()

# Calculate metrics
accuracy = accuracy_score(y_test_np, y_pred_np)
conf_matrix = confusion_matrix(y_test_np, y_pred_np)
class_report = classification_report(y_test_np, y_pred_np)

print(f"Validation Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Evaluating on validation set...
Validation Accuracy: 1.00

Confusion Matrix:
[[13490     0     4     0     4]
 [    2     0  9119     0     0]
 [    3     0     1     0  2362]
 [    3     0     0     0     0]
 [    1     0     0     0     1]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13499
           1       1.00      1.00      1.00      9121
           2       1.00      1.00      1.00      2366
           3       0.99      0.98      0.99       198
           4       0.88      0.70      0.78        10

    accuracy                           1.00     25194
   macro avg       0.97      0.94      0.95     25194
weighted avg       1.00      1.00      1.00     25194



In [23]:
# Cell 6: Evaluate Model on External Test Set
print("\nEvaluating on external test set...")
df_test = cudf.read_csv(processed_test_path)

# Check for missing values in test data
print("\nChecking for missing values in test data:")
print(df_test.isnull().sum().to_pandas())

# Fill missing values in test data
df_test = df_test.fillna(df_test.mean())

X_external_test = df_test.drop(columns=['multiclass_label']).astype('float32')
y_external_test = df_test['multiclass_label'].astype('int32')

# Make predictions
y_external_pred = xgb_multi.predict(X_external_test)

# Convert to numpy arrays for sklearn metrics
y_external_test_np = y_external_test.to_numpy()
y_external_pred_np = y_external_pred if isinstance(y_external_pred, np.ndarray) else y_external_pred.to_numpy()

# Calculate metrics
accuracy_external = accuracy_score(y_external_test_np, y_external_pred_np)
conf_matrix_external = confusion_matrix(y_external_test_np, y_external_pred_np)
class_report_external = classification_report(y_external_test_np, y_external_pred_np, zero_division=0)

print(f"\nExternal Test Set Accuracy: {accuracy_external:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix_external)
print("\nClassification Report:")
print(class_report_external)

# Print the distribution of predicted labels
print("\nPrediction distribution:")
print(cudf.Series(y_external_pred).value_counts().sort_index().to_pandas())
print("\nTrue label distribution:")
print(cudf.Series(y_external_test).value_counts().sort_index().to_pandas())


Evaluating on external test set...

Checking for missing values in test data:
dst_host_srv_count       0
logged_in                0
flag_SF                  0
service_http             0
service_private          0
                      ... 
urgent                   0
land                     0
is_host_login            0
service_tim_i            0
multiclass_label      2420
Length: 110, dtype: int64

External Test Set Accuracy: 0.65

Confusion Matrix:
[[8064 2351 1710    0    6]
 [ 195 4852  694    0    0]
 [ 294  487 1635    5    0]
 [ 837  886  329  145    2]
 [  19   18    7    3    5]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.66      0.75     12131
           1       0.56      0.85      0.68      5741
           2       0.37      0.68      0.48      2421
           3       0.95      0.07      0.12      2199
           4       0.38      0.10      0.15        52

    accuracy                           0.65     22544
