In [7]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import cupy as cp
from cuml import RandomForestClassifier
from cuml.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from cuml.metrics.confusion_matrix import confusion_matrix
from sklearn.metrics import classification_report

# Define paths
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTest_processed.csv'

In [8]:
# Cell 2: Load training data
print("Loading processed training data...")
df_train = cudf.read_csv(processed_train_path)

# Handle missing values in training data
print("\nChecking for missing values in training data:")
print(df_train.isnull().sum().to_pandas())

# Fill missing values if any exist
df_train = df_train.fillna(df_train.mean())

# Select features and labels
X = df_train.drop(columns=['multiclass_label']).astype('float32')
y_multi = df_train['multiclass_label']

# Display data info
print("\nFeature matrix shape:", X.shape)
print("Label distribution:")
print(y_multi.value_counts().to_pandas())

Loading processed training data...

Checking for missing values in training data:
dst_host_srv_count    0
logged_in             0
flag_SF               0
service_http          0
service_private       0
                     ..
urgent                0
land                  0
is_host_login         0
service_tim_i         0
multiclass_label      0
Length: 110, dtype: int64

Feature matrix shape: (125973, 109)
Label distribution:
0    67343
1    45927
2    11656
3      995
4       52
Name: multiclass_label, dtype: int64


In [9]:
# Cell 3: Split into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_multi, test_size=0.2, random_state=42,
)

# Data type conversion
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

print("Dataset split complete")
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_test.shape[0]}")

Dataset split complete
Training samples: 100778
Validation samples: 25195


In [10]:
# Cell 4: Create and train Random Forest model
print("\nTraining Random Forest model on GPU...")
rf_multi = RandomForestClassifier(n_estimators=100, random_state=42, n_streams=1)
rf_multi.fit(X_train, y_train)

# Predict and evaluate on validation set
print("\nEvaluating on validation set...")
y_pred = rf_multi.predict(X_test)

# Data type conversion
y_pred = y_pred.astype('int32')
y_test = y_test.astype('int32')

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Convert to CPU for classification report
y_test_cpu = y_test.to_pandas()
y_pred_cpu = y_pred.to_pandas()
class_report = classification_report(y_test_cpu, y_pred_cpu)

print(f"Validation Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Training Random Forest model on GPU...

Evaluating on validation set...
Validation Accuracy: 1.00

Confusion Matrix:
[[13413     4     5     0     0]
 [   10  9171     0     0     0]
 [   32     2  2323     0     0]
 [   26     0     0   198     0]
 [   10     0     0     0     1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     13422
           1       1.00      1.00      1.00      9181
           2       1.00      0.99      0.99      2357
           3       1.00      0.88      0.94       224
           4       1.00      0.09      0.17        11

    accuracy                           1.00     25195
   macro avg       1.00      0.79      0.82     25195
weighted avg       1.00      1.00      1.00     25195



In [11]:
# Cell 5: Predict and evaluate on validation set again
print("\nEvaluating on validation set...")

# Ensure data types
y_test = y_test.astype('int32')

# Make predictions and convert to int32
y_pred = rf_multi.predict(X_test)
y_pred = cudf.Series(y_pred).astype('int32')

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Convert to CPU for classification report
y_test_cpu = y_test.to_pandas()
y_pred_cpu = y_pred.to_pandas()
class_report = classification_report(y_test_cpu, y_pred_cpu)

print(f"Validation Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Evaluating on validation set...
Validation Accuracy: 1.00

Confusion Matrix:
[[13413     4     5     0     0]
 [   10  9171     0     0     0]
 [   32     2  2323     0     0]
 [   26     0     0   198     0]
 [   10     0     0     0     1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     13422
           1       1.00      1.00      1.00      9181
           2       1.00      0.99      0.99      2357
           3       1.00      0.88      0.94       224
           4       1.00      0.09      0.17        11

    accuracy                           1.00     25195
   macro avg       1.00      0.79      0.82     25195
weighted avg       1.00      1.00      1.00     25195



In [12]:
# Cell 6: Load and evaluate on external test set
print("\nEvaluating on external test set...")
df_test = cudf.read_csv(processed_test_path)

# Check for missing values in test data
print("\nChecking for missing values in test data:")
print(df_test.isnull().sum().to_pandas())

# Fill missing values in test data
df_test = df_test.fillna(df_test.mean())

X_external_test = df_test.drop(columns=['multiclass_label']).astype('float32')
y_external_test = df_test['multiclass_label'].astype('int32')

# Make predictions
y_external_pred = rf_multi.predict(X_external_test)
y_external_pred = cudf.Series(y_external_pred).astype('int32')

# Verify no missing values
print("\nVerifying no missing values in predictions and labels:")
print("Missing in predictions:", y_external_pred.isnull().sum())
print("Missing in actual labels:", y_external_test.isnull().sum())

# Calculate metrics
accuracy_external = accuracy_score(y_external_test, y_external_pred)
conf_matrix_external = confusion_matrix(y_external_test, y_external_pred)

# Convert to CPU for classification report
y_external_test_cpu = y_external_test.to_pandas()
y_external_pred_cpu = y_external_pred.to_pandas()
class_report_external = classification_report(y_external_test_cpu, y_external_pred_cpu, zero_division=0)

print(f"\nExternal Test Set Accuracy: {accuracy_external:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix_external)
print("\nClassification Report:")
print(class_report_external)

# Print the distribution of predicted labels
print("\nPrediction distribution:")
print(cudf.Series(y_external_pred).value_counts().sort_index().to_pandas())
print("\nTrue label distribution:")
print(cudf.Series(y_external_test).value_counts().sort_index().to_pandas())


Evaluating on external test set...

Checking for missing values in test data:
dst_host_srv_count       0
logged_in                0
flag_SF                  0
service_http             0
service_private          0
                      ... 
urgent                   0
land                     0
is_host_login            0
service_tim_i            0
multiclass_label      2420
Length: 110, dtype: int64

Verifying no missing values in predictions and labels:
Missing in predictions: 0
Missing in actual labels: 0

External Test Set Accuracy: 0.73

Confusion Matrix:
[[9631 2050  450    0    0]
 [ 354 5387    0    0    0]
 [ 390  639 1392    0    0]
 [1848  347    4    0    0]
 [  52    0    0    0    0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.79      0.79     12131
           1       0.64      0.94      0.76      5741
           2       0.75      0.57      0.65      2421
           3       0.00      0.00      0.00      2199