In [55]:
# Cell 1: Import Libraries and Setup Paths
import cudf
import cupy as cp
import numpy as np
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
from cuml.model_selection import train_test_split
from cuml.metrics.confusion_matrix import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, f1_score
from tqdm import tqdm

# Define paths
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/multi/KDDTest_processed.csv'

In [56]:
# Cell 2: Define Data Loading and Preprocessing Functions
def load_and_preprocess_data(train_path, test_path, verbose=True):
    """
    Load and preprocess both training and test data
    """
    if verbose:
        print("Loading and preprocessing data...")
    
    # Load training data
    df_train = cudf.read_csv(train_path)
    df_test = cudf.read_csv(test_path)
    
    # Convert feature columns to float32
    feature_cols = [col for col in df_train.columns if col != 'multiclass_label']
    df_train[feature_cols] = df_train[feature_cols].astype('float32')
    df_test[feature_cols] = df_test[feature_cols].astype('float32')
    
    # Handle missing values
    df_train = df_train.fillna(df_train.mean())
    df_test = df_test.fillna(df_test.mean())
    
    # Split features and labels
    X_train = df_train[feature_cols]
    y_train = df_train['multiclass_label'].astype('int32')
    X_test = df_test[feature_cols]
    y_test = df_test['multiclass_label'].astype('int32')
    
    if verbose:
        print("Class distribution in training data:")
        print(y_train.value_counts().sort_index().to_pandas())
    
    return X_train, X_test, y_train, y_test

In [57]:
# Cell 3: Define Improved Sampling Function
def adaptive_sampling(X, y, verbose=False):
    """
    Perform adaptive sampling with different ratios for each class
    """
    X_cpu = X.to_pandas().values if isinstance(X, cudf.DataFrame) else X
    y_cpu = y.to_pandas().values if isinstance(y, cudf.Series) else y
    
    # Calculate class distributions
    class_counts = np.bincount(y_cpu)
    if verbose:
        print("Original class distribution:")
        print(class_counts)
    
    # Calculate sampling strategy - different ratios for different classes
    sampling_strategy = {
        1: int(class_counts[1] * 1.2), 
        2: int(class_counts[2] * 2.0),  
        3: int(class_counts[3] * 5.0),  
        4: int(class_counts[4] * 10.0)  
    }
    
    # Apply SMOTE with adaptive strategy
    smote = SMOTE(
        sampling_strategy=sampling_strategy,
        random_state=42,
        k_neighbors=min(5, min(class_counts[class_counts > 0]) - 1)
    )
    
    X_resampled, y_resampled = smote.fit_resample(X_cpu, y_cpu)
    
    if verbose:
        print("\nClass distribution after sampling:")
        print(np.bincount(y_resampled))
    
    return X_resampled, y_resampled

In [58]:
# Cell 4: Define Evaluation on Imbalanced Data Function
def evaluate_on_imbalanced(model, X_val, y_val):
    y_pred = model.predict(X_val)
    # 使用macro平均而不是weighted
    score = f1_score(
        y_val.to_pandas(),
        y_pred.to_pandas(),
        average='macro'
    )
    return score

In [59]:
# Cell 5: Define Grid Search Function
def perform_grid_search(X, y, base_model):
    """
    Perform grid search with optimized parameter space
    """
    print("\nPerforming grid search...")
    
    # Optimized parameter grid
    param_grid = {
        'n_estimators': [200, 300],
        'max_depth': [20, 25, 30],
        'min_samples_split': [10, 15],
        'max_features': [0.7, 0.8, 0.9],
        'min_samples_leaf': [5, 10]
    }
    
    best_score = 0
    best_params = {}
    
    # Convert data for grid search
    X_cpu = X.to_pandas().values if isinstance(X, cudf.DataFrame) else X
    y_cpu = y.to_pandas().values if isinstance(y, cudf.Series) else y
    
    # Create CV splits
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Generate parameter combinations
    param_combinations = [
        {'n_estimators': n, 'max_depth': d, 'min_samples_split': s, 
         'max_features': f, 'min_samples_leaf': l}
        for n in param_grid['n_estimators']
        for d in param_grid['max_depth']
        for s in param_grid['min_samples_split']
        for f in param_grid['max_features']
        for l in param_grid['min_samples_leaf']
    ]
    
    # Progress bar for parameter combinations
    for params in tqdm(param_combinations, desc="Parameter combinations"):
        model = RandomForestClassifier(**params, n_bins=256, n_streams=1, random_state=42)
        scores = []
        
        # Progress bar for cross-validation
        for train_idx, val_idx in cv.split(X_cpu, y_cpu):
            X_train_fold = cudf.DataFrame(X_cpu[train_idx])
            y_train_fold = cudf.Series(y_cpu[train_idx])
            X_val_fold = cudf.DataFrame(X_cpu[val_idx])
            y_val_fold = cudf.Series(y_cpu[val_idx])
            
            # Balance training data
            X_train_balanced, y_train_balanced = adaptive_sampling(
                X_train_fold, y_train_fold, verbose=False
            )
            X_train_balanced = cudf.DataFrame(X_train_balanced)
            y_train_balanced = cudf.Series(y_train_balanced)
            
            # Train model
            model.fit(X_train_balanced, y_train_balanced)
            
            # Evaluate on original distribution
            score = evaluate_on_imbalanced(model, X_val_fold, y_val_fold)
            scores.append(score)
        
        avg_score = np.mean(scores)
        if avg_score > best_score:
            best_score = avg_score
            best_params = params
            print(f"\nNew best score: {best_score:.3f} with params: {best_params}")
    
    return best_params

In [60]:
# Cell 6: Define Model Evaluation Function
def evaluate_model(model, X, y, dataset_name=""):
    """
    Evaluate model performance with multiple metrics
    """
    print(f"\nEvaluating on {dataset_name}...")
    y_pred = model.predict(X)
    
    # Convert to correct types
    y = y.astype('int32')
    y_pred = cudf.Series(y_pred).astype('int32')
    
    # Calculate metrics
    accuracy = accuracy_score(y, y_pred)
    conf_mat = confusion_matrix(y, y_pred)
    
    # Calculate class-specific metrics
    y_cpu = y.to_pandas()
    y_pred_cpu = y_pred.to_pandas()
    class_report = classification_report(y_cpu, y_pred_cpu)
    
    print(f"Accuracy: {accuracy:.3f}")
    print("\nConfusion Matrix:")
    print(conf_mat)
    print("\nClassification Report:")
    print(class_report)
    
    # Print prediction distribution
    print("\nPrediction distribution:")
    print(y_pred.value_counts().sort_index().to_pandas())
    print("\nTrue label distribution:")
    print(y.value_counts().sort_index().to_pandas())
    
    return {
        'accuracy': accuracy,
        'confusion_matrix': conf_mat,
        'classification_report': class_report
    }


In [61]:
# Cell 7: Define Main Function
def main():
    # Load and preprocess data
    print("Step 1/5: Loading and preprocessing data...")
    X_train, X_external, y_train, y_external = load_and_preprocess_data(
        processed_train_path, processed_test_path
    )
    
    # Split training data
    print("\nStep 2/5: Splitting training data...")
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Create base model
    print("\nStep 3/5: Creating base model...")
    base_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=25,
        min_samples_leaf=10,
        min_samples_split=15,
        max_features=0.8,
        n_bins=256,
        n_streams=1,
        random_state=42
    )
    
    # Perform grid search
    print("\nStep 4/5: Performing grid search...")
    best_params = perform_grid_search(X_train_split, y_train_split, base_model)
    
    # Create and train final model
    print("\nStep 5/5: Training final model...")
    final_model = RandomForestClassifier(
        **best_params,
        n_bins=256,
        n_streams=1,
        random_state=42
    )
    
    # Balance training data
    print("\nBalancing training data...")
    X_train_balanced, y_train_balanced = adaptive_sampling(X_train, y_train, verbose=True)
    X_train_balanced = cudf.DataFrame(X_train_balanced)
    y_train_balanced = cudf.Series(y_train_balanced)
    
    # Train final model
    print("\nTraining final model...")
    final_model.fit(X_train_balanced, y_train_balanced)
    
    # Evaluate model
    val_metrics = evaluate_model(final_model, X_val, y_val, "validation set")
    external_metrics = evaluate_model(final_model, X_external, y_external, "external test set")
    
    return final_model, val_metrics, external_metrics

In [62]:
# Cell 8: Execute Main Function
if __name__ == "__main__":
    final_model, val_metrics, external_metrics = main()

Step 1/5: Loading and preprocessing data...
Loading and preprocessing data...
Class distribution in training data:
0    67343
1    45927
2    11656
3      995
4       52
Name: multiclass_label, dtype: int64

Step 2/5: Splitting training data...

Step 3/5: Creating base model...

Step 4/5: Performing grid search...

Performing grid search...


Parameter combinations:   1%|▏         | 1/72 [00:21<25:02, 21.16s/it]


New best score: 0.935 with params: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'max_features': 0.7, 'min_samples_leaf': 5}


Parameter combinations:  10%|▉         | 7/72 [02:32<23:48, 21.98s/it]


New best score: 0.936 with params: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 15, 'max_features': 0.7, 'min_samples_leaf': 5}


Parameter combinations:  51%|█████▏    | 37/72 [13:46<14:24, 24.69s/it]


New best score: 0.941 with params: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 10, 'max_features': 0.7, 'min_samples_leaf': 5}


Parameter combinations:  68%|██████▊   | 49/72 [19:57<11:51, 30.94s/it]


New best score: 0.941 with params: {'n_estimators': 300, 'max_depth': 25, 'min_samples_split': 10, 'max_features': 0.7, 'min_samples_leaf': 5}


Parameter combinations: 100%|██████████| 72/72 [31:56<00:00, 26.61s/it]



Step 5/5: Training final model...

Balancing training data...
Original class distribution:
[67343 45927 11656   995    52]

Class distribution after sampling:
[67343 55112 23312  4975   520]

Training final model...

Evaluating on validation set...
Accuracy: 1.000

Confusion Matrix:
[[13493     3     3     0     0]
 [    1  9120     0     0     0]
 [    2     0  2364     0     0]
 [    1     0     0   196     1]
 [    1     0     0     0     9]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13499
           1       1.00      1.00      1.00      9121
           2       1.00      1.00      1.00      2366
           3       1.00      0.99      0.99       198
           4       0.90      0.90      0.90        10

    accuracy                           1.00     25194
   macro avg       0.98      0.98      0.98     25194
weighted avg       1.00      1.00      1.00     25194


Prediction distribution:
0    13498

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
