In [25]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import cupy as cp
import numpy as np
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
from cuml.model_selection import train_test_split
from cuml.metrics.confusion_matrix import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, f1_score
from tqdm import tqdm
from sklearn.model_selection import RandomizedSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Define paths
# Locations of the pre-processed train / test CSV files; main() passes them to
# load_and_preprocess_data(), which reads a 'multiclass_label' column from each.
# NOTE(review): these are absolute, machine-specific paths — consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTest_processed.csv'

In [26]:
# Cell 2: Define Data Loading and Preprocessing Functions

def load_and_preprocess_data(train_path, test_path, verbose=True):
    """Read the train/test CSV files and return features and integer labels.

    Every column except ``'multiclass_label'`` is treated as a feature and cast
    to float32. Rows whose label is missing are dropped. Missing feature values
    are imputed with the *training* column means in both frames, so no
    statistic computed on the test set is used to prepare the test features and
    the label column is never imputed.

    Args:
        train_path: Path of the training CSV (read with ``cudf.read_csv``).
        test_path: Path of the test CSV; it must contain the same feature
            columns as the training file.
        verbose: When True, print a progress banner and the training class
            distribution.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``: the feature frames and
        the int32 label series.
    """
    label_col = 'multiclass_label'

    if verbose:
        print("Loading and preprocessing data...")

    df_train = cudf.read_csv(train_path)
    df_test = cudf.read_csv(test_path)

    # A row without a label cannot be used for training or scoring; filling it
    # with the mean label would invent a (truncated) class id.
    df_train = df_train.dropna(subset=[label_col])
    df_test = df_test.dropna(subset=[label_col])

    # Convert feature columns to float32
    feature_cols = [col for col in df_train.columns if col != label_col]
    df_train[feature_cols] = df_train[feature_cols].astype('float32')
    df_test[feature_cols] = df_test[feature_cols].astype('float32')

    # Handle missing values: statistics come from the training features only
    # and are reused for the test frame to avoid leaking test information.
    train_means = df_train[feature_cols].mean()
    df_train[feature_cols] = df_train[feature_cols].fillna(train_means)
    df_test[feature_cols] = df_test[feature_cols].fillna(train_means)

    # Split features and labels
    X_train = df_train[feature_cols]
    y_train = df_train[label_col].astype('int32')
    X_test = df_test[feature_cols]
    y_test = df_test[label_col].astype('int32')

    if verbose:
        print("Class distribution in training data:")
        print(y_train.value_counts().sort_index().to_pandas())

    return X_train, X_test, y_train, y_test

In [27]:
# Cell 3: Define Improved Sampling Function

def adaptive_sampling(X, y, verbose=False):
    """Rebalance a training set: SMOTE-oversample classes 1-4, trim class 0.

    Classes 1, 2, 3 and 4 are grown to 1.05x, 1.2x, 2.0x and 3.0x their
    original size with SMOTE, then class 0 is randomly under-sampled to 80% of
    its original size. Both samplers are seeded so repeated runs produce the
    same result.

    Classes that are absent from ``y`` are left out of the sampling strategy,
    and classes with fewer than two samples are not oversampled because SMOTE
    needs at least one neighbour to interpolate with.

    Args:
        X: Feature matrix. A ``cudf.DataFrame`` is copied to host memory;
            anything else is passed to the samplers unchanged.
        y: Non-negative integer class labels. A ``cudf.Series`` is copied to
            host memory; anything else is used unchanged.
        verbose: When True, print the class counts before and after sampling.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as returned by the samplers.
    """
    X_cpu = X.to_pandas().values if isinstance(X, cudf.DataFrame) else X
    y_cpu = y.to_pandas().values if isinstance(y, cudf.Series) else y

    # Calculate class distributions
    class_counts = np.bincount(y_cpu)
    if verbose:
        print("Original class distribution:")
        print(class_counts)

    # Per-class growth factors for the minority classes and the fraction of
    # the majority class (label 0) that is kept.
    growth_factors = {1: 1.05, 2: 1.2, 3: 2.0, 4: 3.0}
    majority_keep = 0.8

    # Only request classes that exist and have enough samples for SMOTE.
    oversample_targets = {
        label: int(class_counts[label] * factor)
        for label, factor in growth_factors.items()
        if label < len(class_counts) and class_counts[label] >= 2
    }

    samplers = []
    if oversample_targets:
        smallest = min(int(class_counts[label]) for label in oversample_targets)
        samplers.append(
            SMOTE(
                sampling_strategy=oversample_targets,
                random_state=42,
                # Every oversampled class has >= 2 samples, so this is >= 1.
                k_neighbors=min(5, smallest - 1),
            )
        )

    if len(class_counts) > 0 and class_counts[0] > 0:
        samplers.append(
            RandomUnderSampler(
                sampling_strategy={0: max(1, int(class_counts[0] * majority_keep))},
                # Seeded so the retained majority rows are reproducible.
                random_state=42,
            )
        )

    # Apply the samplers in order: oversample first, then under-sample.
    X_resampled, y_resampled = X_cpu, y_cpu
    for sampler in samplers:
        X_resampled, y_resampled = sampler.fit_resample(X_resampled, y_resampled)

    if verbose:
        print("\nClass distribution after sampling:")
        print(np.bincount(y_resampled))

    return X_resampled, y_resampled

In [28]:
# Cell 4: Define Model Evaluation Function

def evaluate_model(model, X, y, dataset_name=""):
    """Score ``model`` on ``(X, y)``, print a report and return the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix handed to ``model.predict``.
        y: True labels; must provide ``astype``, ``to_pandas`` and
            ``value_counts`` (main() passes a cudf Series).
        dataset_name: Label used in the "Evaluating on ..." banner.

    Returns:
        Dict with the keys ``'accuracy'``, ``'confusion_matrix'`` and
        ``'classification_report'``.
    """
    print(f"\nEvaluating on {dataset_name}...")

    # Bring truth and predictions to the same int32 dtype before scoring.
    predictions = cudf.Series(model.predict(X)).astype('int32')
    labels = y.astype('int32')

    # The text report is produced by scikit-learn, so it gets host copies.
    results = {
        'accuracy': accuracy_score(labels, predictions),
        'confusion_matrix': confusion_matrix(labels, predictions),
        'classification_report': classification_report(
            labels.to_pandas(), predictions.to_pandas(), zero_division=0
        ),
    }

    print(f"Accuracy: {results['accuracy']:.3f}")
    for heading, key in (
        ("\nConfusion Matrix:", 'confusion_matrix'),
        ("\nClassification Report:", 'classification_report'),
    ):
        print(heading)
        print(results[key])

    # Predicted vs. true class frequencies, side by side for a quick sanity check.
    for heading, series in (
        ("\nPrediction distribution:", predictions),
        ("\nTrue label distribution:", labels),
    ):
        print(heading)
        print(series.value_counts().sort_index().to_pandas())

    return results

In [29]:
# Cell 5: Define Main Function

def main():
    """Run the full pipeline: load, split, select features, balance, train, evaluate.

    Steps:
        1. Load the processed train / external-test CSV files.
        2. Hold out 20% of the training data as a validation set.
        3. Keep the top features by ANOVA F-score (at most 50), fitted on the
           training split only.
        4. Rebalance the training split with ``adaptive_sampling``.
        5. Fit a bagged ensemble of random forests.
        6. Report metrics on the validation and external test sets.

    Returns:
        Tuple ``(final_model, val_metrics, external_metrics)``; the metric
        dicts are those returned by ``evaluate_model``.
    """
    # Load and preprocess data (the loader prints its own progress banner, so
    # it is not repeated here).
    X_train, X_external, y_train, y_external = load_and_preprocess_data(
        processed_train_path, processed_test_path
    )

    # Split training data
    # NOTE(review): the split is not stratified, so the rarest class can end up
    # with very few validation rows — consider a stratified split.
    print("\nSplitting training data...")
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Feature selection (scikit-learn works on host data, hence to_pandas()).
    print("\nPerforming feature selection...")
    y_train_split_cpu = y_train_split.to_pandas()
    # SelectKBest rejects k larger than the number of columns, so cap it.
    n_selected = min(50, X_train_split.shape[1])
    selector = SelectKBest(f_classif, k=n_selected)
    X_train_selected = selector.fit_transform(X_train_split.to_pandas(), y_train_split_cpu)
    X_val_selected = selector.transform(X_val.to_pandas())
    X_external_selected = selector.transform(X_external.to_pandas())

    # Balance training data. The result stays on the host: the bagging wrapper
    # below is a scikit-learn estimator and consumes host arrays directly.
    print("\nBalancing training data...")
    X_train_balanced, y_train_balanced = adaptive_sampling(
        X_train_selected, y_train_split_cpu, verbose=True
    )

    # Create and train final model with Bagging
    print("\nTraining final model...")
    base_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_leaf=5,
        min_samples_split=10,
        max_features=0.8,
        n_bins=256,
        n_streams=1,
        random_state=42,
    )

    # Use Bagging to improve model performance
    # NOTE(review): n_jobs=-1 may run several GPU-backed forests concurrently;
    # confirm this does not oversubscribe device memory.
    final_model = BaggingClassifier(
        estimator=base_model,
        n_estimators=10,
        random_state=42,
        n_jobs=-1
    )

    # Train final model
    final_model.fit(X_train_balanced, y_train_balanced)

    # Evaluate model
    val_metrics = evaluate_model(final_model, X_val_selected, y_val, "validation set")
    external_metrics = evaluate_model(final_model, X_external_selected, y_external, "external test set")

    return final_model, val_metrics, external_metrics

In [30]:
# Cell 6: Execute Main Function
# Run the whole pipeline and keep the fitted model and both metric dicts as
# notebook-level variables for inspection in later cells.
if __name__ == "__main__":
    final_model, val_metrics, external_metrics = main()

Loading and preprocessing data...
Loading and preprocessing data...


Class distribution in training data:
0    67343
1    45927
2    11656
3      995
4       52
Name: multiclass_label, dtype: int64

Splitting training data...

Performing feature selection...

Balancing training data...
Original class distribution:
[53844 36806  9290   797    42]

Class distribution after sampling:
[43075 38646 11148  1594   126]

Training final model...

Evaluating on validation set...
Accuracy: 0.996

Confusion Matrix:
[[13449    11    24    14     1]
 [   11  9110     0     0     0]
 [   20     1  2345     0     0]
 [    8     0     0   190     0]
 [    7     0     0     0     3]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13499
           1       1.00      1.00      1.00      9121
           2       0.99      0.99      0.99      2366
           3       0.93      0.96      0.95       198
           4       0.75      0.30      0.43        10

    accuracy                           1.00 