In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Plot styling: make seaborn's "husl" palette the colour cycle for all figures
sns.set_palette(palette="husl")

In [None]:
# Data Loading
# Read the preprocessed training table from disk.
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/KDDTrain_processed.csv'
df_train = pd.read_csv(processed_train_path)

# The target is the 'binary_label' column; the features are every column
# except the two label columns.
y_binary = df_train['binary_label']
X = df_train.drop(columns=['label', 'binary_label'])

# Quick sanity output: table dimensions and per-class row counts.
n_samples, n_features = X.shape
print(f"Dataset loaded: {n_samples} samples, {n_features} features")
print(f"Class distribution:\n{y_binary.value_counts()}")

In [None]:
# Data Split
# Hold out 20% of the rows. The split is stratified on the target so both
# parts keep the same class ratio, and seeded so it is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y_binary)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, **split_options)

In [None]:
# Model Configuration
# Class imbalance is handled by the resampling steps of the pipeline below,
# so `scale_pos_weight` is intentionally left at its default. Re-weighting the
# positive class on top of already balanced training data would compensate
# for the imbalance twice and bias predictions towards the positive class.
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    random_state=42,
    n_estimators=1000,  # upper bound only; early stopping decides the real count
    early_stopping_rounds=50
)

# Early-Stopping Split
# Early stopping tunes the number of trees against its eval set, so that set
# must not be the hold-out split used for reporting. Carve a dedicated slice
# out of the training data and leave X_test / y_test untouched until the
# evaluation cells.
X_fit, X_early_stop, y_fit, y_early_stop = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train
)

# Pipeline Setup and Training
# The samplers only run during `fit`; prediction goes straight to the
# classifier. NOTE(review): with their default sampling strategies SMOTE
# already equalises the class counts, so the undersampler has nothing left to
# remove -- configure `sampling_strategy` on both if a combined over/under
# scheme is intended.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('undersampler', RandomUnderSampler(random_state=42)),
    ('classifier', xgb_classifier)
])

# `classifier__*` keyword arguments are forwarded to XGBClassifier.fit; the
# eval set is passed as-is (not resampled), so the early-stopping AUC is
# measured on the natural class distribution.
pipeline.fit(
    X_fit,
    y_fit,
    classifier__eval_set=[(X_early_stop, y_early_stop)],
    classifier__verbose=100
)

# Fitted classifier, exposed for the feature-importance analysis
model = pipeline.named_steps['classifier']

In [None]:
# Feature Importance Analysis
# Pair every input column with the importance score of the fitted classifier
# and rank the result from most to least important.
importance_scores = model.feature_importances_
importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the 15 highest-ranked features
top_features = importance_df.iloc[:15]
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance Analysis')
fig.tight_layout()
plt.show()

In [None]:
# Validation Set Performance
# Hard class predictions and positive-class probabilities for the held-out split
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# ROC curve points (the thresholds are not needed) for the area under the curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Collect every metric under a stable key so later cells can look them up
validation_metrics = dict(
    accuracy=accuracy_score(y_test, y_pred),
    confusion_matrix=confusion_matrix(y_test, y_pred),
    classification_report=classification_report(y_test, y_pred),
    roc_auc=auc(fpr, tpr),
)

print("\nValidation Metrics:")
print(f"Accuracy: {validation_metrics['accuracy']:.4f}")
print(f"ROC AUC: {validation_metrics['roc_auc']:.4f}")
print("\nConfusion Matrix:", validation_metrics['confusion_matrix'], sep="\n")
print("\nClassification Report:", validation_metrics['classification_report'], sep="\n")

In [None]:
# External Test Set Evaluation
# Score the fitted pipeline on the separately provided, preprocessed test file.
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/KDDTest_processed.csv'
df_test = pd.read_csv(processed_test_path)

# Same feature/target separation as for the training table
X_external = df_test.drop(columns=['label', 'binary_label'])
y_external = df_test['binary_label']

# Positive-class probabilities and hard class predictions
y_external_proba = pipeline.predict_proba(X_external)[:, 1]
y_external_pred = pipeline.predict(X_external)

test_metrics = {
    'accuracy': accuracy_score(y_external, y_external_pred),
    'confusion_matrix': confusion_matrix(y_external, y_external_pred),
    'classification_report': classification_report(y_external, y_external_pred)
}

# ROC AUC from the predicted probabilities, so the external set reports the
# same metrics as the validation cell (the probabilities were previously
# computed but never used). Separate names keep the validation fpr/tpr intact.
fpr_external, tpr_external, _ = roc_curve(y_external, y_external_proba)
test_metrics['roc_auc'] = auc(fpr_external, tpr_external)

print("\nExternal Test Set Metrics:")
print(f"Accuracy: {test_metrics['accuracy']:.4f}")
print(f"ROC AUC: {test_metrics['roc_auc']:.4f}")
print("\nConfusion Matrix:")
print(test_metrics['confusion_matrix'])
print("\nClassification Report:")
print(test_metrics['classification_report'])

In [None]:
# Performance Summary
# Display names for the two classes, in sorted-label order.
# NOTE(review): assumes the smaller label value is the normal class, exactly
# as the previous row-based parsing of the text report did -- confirm against
# the preprocessing step.
SUMMARY_CLASS_NAMES = ['Normal', 'Anomaly']


def _summary_column(y_true, y_hat):
    """Build one column of the summary table.

    Parameters:
        y_true: ground-truth binary labels.
        y_hat: predicted binary labels for the same rows.

    Returns:
        A list ``[accuracy, normal precision, normal recall,
        anomaly precision, anomaly recall]`` of unrounded floats.

    The per-class scores are read by name from the dictionary form of the
    classification report. Splitting the text report on whitespace is
    position-dependent (each row is ``label precision recall f1 support``,
    which is easy to index off by one) and only yields values rounded to two
    decimals.
    """
    report = classification_report(
        y_true,
        y_hat,
        target_names=SUMMARY_CLASS_NAMES,
        output_dict=True
    )
    column = [accuracy_score(y_true, y_hat)]
    for class_name in SUMMARY_CLASS_NAMES:
        column.append(report[class_name]['precision'])
        column.append(report[class_name]['recall'])
    return column


metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Normal Precision', 'Normal Recall', 'Anomaly Precision', 'Anomaly Recall'],
    'Validation': _summary_column(y_test, y_pred),
    'Test': _summary_column(y_external, y_external_pred)
})

print("\nPerformance Summary:")
print(metrics_df.round(4))