In [76]:
# Cell 1: Imports and Path Definition
import pandas as pd
import numpy as np
import cudf
import cupy as cp
from sklearn.model_selection import train_test_split
from cuml import RandomForestClassifier as cuRF
from xgboost import XGBClassifier
from cuml.metrics import accuracy_score
from cuml.metrics.confusion_matrix import confusion_matrix
from sklearn.metrics import classification_report

# Define paths
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/bin/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/bin/KDDTest_processed.csv'

In [77]:
# Cell 2: Load and Prepare Training Data
print("Loading processed training data...")
df_train = cudf.read_csv(processed_train_path)

# Select features and labels and convert to appropriate types
X = df_train.drop(columns=['binary_label']).astype('float32')
y_binary = df_train['binary_label'].astype('int32')

# Display data info
print("Feature matrix shape:", X.shape)
print("Label distribution:")
print(y_binary.value_counts().to_pandas())

Loading processed training data...
Feature matrix shape: (125973, 108)
Label distribution:
0    67343
1    58630
Name: binary_label, dtype: int64


In [78]:
# Cell 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42
)

# Convert data types
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('int32')
y_test = y_test.astype('int32')

print("Dataset split complete")
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_test.shape[0]}")

Dataset split complete
Training samples: 100778
Validation samples: 25195


In [79]:
# Cell 4: Model Training and Feature Importance Analysis
# Create and train Random Forest model (GPU)
print("\nTraining Random Forest model on GPU...")
rf_model = cuRF(
    n_estimators=50,
    random_state=42,
    n_streams=1
)
rf_model.fit(X_train, y_train)

# Create and train XGBoost model (GPU)
print("\nTraining XGBoost model on GPU...")
xgb_model = XGBClassifier(
    n_estimators=50,
    random_state=42,
    tree_method='hist',
    device='cuda:0',
    max_depth=8,
    learning_rate=0.1,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss'
)

# Train XGBoost model
xgb_model.fit(X_train.to_pandas(), y_train.to_pandas(), verbose=True)

# Get XGBoost feature importance
print("\nTop 10 most important features (XGBoost):")
feature_importance = pd.DataFrame({
    'feature': X.columns.values,
    'importance': xgb_model.feature_importances_
})
display(feature_importance.sort_values('importance', ascending=False).head(10))


Training Random Forest model on GPU...

Training XGBoost model on GPU...

Top 10 most important features (XGBoost):


Unnamed: 0,feature,importance
0,flag_SF,0.209945
96,src_bytes,0.164143
16,service_ecr_i,0.141934
14,protocol_type_icmp,0.092086
5,service_http,0.058661
101,dst_bytes,0.052461
13,diff_srv_rate,0.02977
6,service_private,0.022431
4,count,0.022201
2,logged_in,0.016933


In [80]:
# Cell 5: Validation Set Evaluation
def get_ensemble_predictions(X_data, rf_weight=0.4, xgb_weight=0.6):
    """
    Get weighted ensemble predictions from RF and XGBoost models
    """
    # Convert X_data to appropriate formats for each model
    X_data_gpu = X_data.astype('float32') if isinstance(X_data, cudf.DataFrame) else cudf.DataFrame(X_data).astype('float32')
    X_data_cpu = X_data.to_pandas() if isinstance(X_data, cudf.DataFrame) else X_data
    
    # Get probability predictions from both models
    rf_probs = rf_model.predict_proba(X_data_gpu)
    xgb_probs = xgb_model.predict_proba(X_data_cpu)
    
    # Convert predictions to numpy arrays
    rf_probs = cp.asnumpy(rf_probs) if isinstance(rf_probs, cp.ndarray) else rf_probs.to_numpy()
    xgb_probs = np.array(xgb_probs)
    
    # Weighted average of probabilities
    ensemble_probs = rf_probs * rf_weight + xgb_probs * xgb_weight
    
    # Convert to class predictions
    return np.argmax(ensemble_probs, axis=1)

# Evaluate on validation set
print("\nEvaluating ensemble on validation set...")
y_pred = get_ensemble_predictions(X_test)
y_pred = cudf.Series(y_pred).astype('int32')

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Convert to CPU for classification report
y_test_cpu = y_test.to_pandas()
y_pred_cpu = y_pred.to_pandas()
class_report = classification_report(y_test_cpu, y_pred_cpu)

print(f"Validation Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Evaluating ensemble on validation set...
Validation Accuracy: 1.00

Confusion Matrix:
[[13409    13]
 [   29 11744]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195



In [81]:
# Cell 6: External Test Set Evaluation
print("\nEvaluating ensemble on external test set...")
df_test = cudf.read_csv(processed_test_path)

X_external_test = df_test.drop(columns=['binary_label']).astype('float32')
y_external_test = df_test['binary_label'].astype('int32')

# Make predictions
y_external_pred = get_ensemble_predictions(X_external_test)
y_external_pred = cudf.Series(y_external_pred).astype('int32')

# Calculate metrics
accuracy_external = accuracy_score(y_external_test, y_external_pred)
conf_matrix_external = confusion_matrix(y_external_test, y_external_pred)

# Convert to CPU for classification report
y_external_test_cpu = y_external_test.to_pandas()
y_external_pred_cpu = y_external_pred.to_pandas()
class_report_external = classification_report(y_external_test_cpu, y_external_pred_cpu)

print(f"External Test Set Accuracy: {accuracy_external:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix_external)
print("\nClassification Report:")
print(class_report_external)


Evaluating ensemble on external test set...
External Test Set Accuracy: 0.86

Confusion Matrix:
[[ 7091  2620]
 [  527 12306]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.73      0.82      9711
           1       0.82      0.96      0.89     12833

    accuracy                           0.86     22544
   macro avg       0.88      0.84      0.85     22544
weighted avg       0.87      0.86      0.86     22544

