In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define paths
# The dataset directory can be overridden through the SL_NSL_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset the original location is used, so the resulting paths are unchanged.
import os

DATA_DIR = os.environ.get(
    'SL_NSL_DATA_DIR',
    '/root/autodl-tmp/projects/SL_NSL/dataset/processed',
)
processed_train_path = os.path.join(DATA_DIR, 'KDDTrain_processed.csv')
processed_test_path = os.path.join(DATA_DIR, 'KDDTest_processed.csv')

In [None]:
# Read the preprocessed training CSV into memory
print("Loading processed training data...")
df_train = pd.read_csv(processed_train_path)

# The binary label is the target; every column other than the two label
# columns is kept as a model input
y_binary = df_train['binary_label']
X = df_train.drop(['label', 'binary_label'], axis=1)

# Report the feature-matrix dimensions and the per-class row counts
print("Feature matrix shape:", X.shape)
print("Label distribution:", y_binary.value_counts(), sep="\n")

Loading processed training data...
Feature matrix shape: (125973, 108)
Label distribution:
binary_label
0    67343
1    58630
Name: count, dtype: int64


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# Note: the held-out part is stored in X_test / y_test even though it is used as a
# validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
)

print("Dataset split complete")
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_test)}")

Dataset split complete
Training samples: 100778
Validation samples: 25195


In [None]:
# Fit a 50-tree Random Forest on the training split (seeded for reproducibility)
print("\nTraining Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

# Fit a 50-round XGBoost classifier on the same split
print("\nTraining XGBoost model...")
xgb_model = XGBClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

# One importance table per model, keyed by feature name
rf_importance = pd.DataFrame(
    {'feature': X.columns, 'rf_importance': rf_model.feature_importances_}
)
xgb_importance = pd.DataFrame(
    {'feature': X.columns, 'xgb_importance': xgb_model.feature_importances_}
)

# Join the two tables on the feature name and add the mean of both scores
combined_importance = pd.merge(rf_importance, xgb_importance, on='feature').assign(
    avg_importance=lambda table: (table['rf_importance'] + table['xgb_importance']) / 2
)

print("\nTop 10 most important features (averaged):")
display(
    combined_importance
    .sort_values('avg_importance', ascending=False)
    .head(10)
)



Training Random Forest model...

Training XGBoost model...

Top 10 most important features (averaged):


Unnamed: 0,feature,rf_importance,xgb_importance,avg_importance
96,src_bytes,0.183422,0.213946,0.198684
16,service_ecr_i,0.009234,0.357669,0.183451
101,dst_bytes,0.11315,0.022643,0.067896
13,diff_srv_rate,0.080444,0.050648,0.065546
5,service_http,0.010844,0.103912,0.057378
1,dst_host_srv_count,0.104179,0.004264,0.054221
0,flag_SF,0.082369,0.002764,0.042566
2,logged_in,0.052964,0.027883,0.040424
10,dst_host_diff_srv_rate,0.060212,0.008997,0.034605
4,count,0.049262,0.011423,0.030343


In [None]:
# Function to get ensemble predictions
def get_ensemble_predictions(X_data, rf_weight=0.4, xgb_weight=0.6, rf_clf=None, xgb_clf=None):
    """
    Get weighted ensemble predictions from the RF and XGBoost models.

    The class-probability matrices of both models are combined as
    ``rf_weight * rf_probs + xgb_weight * xgb_probs`` and the class with the
    highest combined score is predicted for each row.

    Parameters
    ----------
    X_data : array-like
        Samples to classify; passed unchanged to each model's ``predict_proba``.
    rf_weight, xgb_weight : float
        Weights applied to the Random Forest / XGBoost probabilities.
    rf_clf, xgb_clf : fitted classifier, optional
        Models to combine. When omitted, the notebook-level ``rf_model`` and
        ``xgb_model`` are used, which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        Predicted class label per row. Column indices are translated through
        ``rf_clf.classes_`` when that attribute exists, so the result is a real
        label even if the labels are not 0..k-1. For 0/1 labels this equals
        the plain argmax index.

    Raises
    ------
    ValueError
        If the two models return probability matrices of different shapes.

    Notes
    -----
    Both models are assumed to order their probability columns identically
    (same classes in the same order).
    """
    # Fall back to the globally trained models only when none are supplied
    rf_clf = rf_model if rf_clf is None else rf_clf
    xgb_clf = xgb_model if xgb_clf is None else xgb_clf

    # Get probability predictions from both models
    rf_probs = np.asarray(rf_clf.predict_proba(X_data))
    xgb_probs = np.asarray(xgb_clf.predict_proba(X_data))

    # Guard against silent broadcasting of mismatched probability matrices
    if rf_probs.shape != xgb_probs.shape:
        raise ValueError(
            f"Probability shapes differ: RF {rf_probs.shape} vs XGBoost {xgb_probs.shape}"
        )

    # Weighted average of probabilities
    ensemble_probs = rf_probs * rf_weight + xgb_probs * xgb_weight

    # Convert to class predictions: winning column -> actual class label
    best_columns = np.argmax(ensemble_probs, axis=1)
    class_labels = getattr(rf_clf, 'classes_', None)
    if class_labels is None:
        return best_columns
    return np.asarray(class_labels)[best_columns]

# Score the ensemble on the held-out validation split
print("\nEvaluating ensemble on validation set...")
y_pred = get_ensemble_predictions(X_test)

# Accuracy, confusion matrix and per-class report, computed in that order
accuracy, conf_matrix, class_report = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)

# Note: accuracy is shown with two decimals, so values >= 0.995 print as 1.00
print(f"Validation Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")


Evaluating ensemble on validation set...
Validation Accuracy: 1.00

Confusion Matrix:
[[13409    13]
 [   17 11756]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195



In [None]:
# Score the ensemble on the separate, preprocessed test file
print("\nEvaluating ensemble on external test set...")
df_test = pd.read_csv(processed_test_path)

# Same feature/label selection as for the training file
y_external_test = df_test['binary_label']
X_external_test = df_test.drop(['label', 'binary_label'], axis=1)

# Ensemble predictions with the default model weights
y_external_pred = get_ensemble_predictions(X_external_test)

# Accuracy, confusion matrix and per-class report, computed in that order
accuracy_external, conf_matrix_external, class_report_external = (
    metric(y_external_test, y_external_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(f"External Test Set Accuracy: {accuracy_external:.2f}")
print("\nConfusion Matrix:", conf_matrix_external, sep="\n")
print("\nClassification Report:", class_report_external, sep="\n")


Evaluating ensemble on external test set...
External Test Set Accuracy: 0.86

Confusion Matrix:
[[ 6811  2900]
 [  364 12469]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.70      0.81      9711
           1       0.81      0.97      0.88     12833

    accuracy                           0.86     22544
   macro avg       0.88      0.84      0.85     22544
weighted avg       0.87      0.86      0.85     22544

