In [None]:
# Cell 1: Import Libraries and Setup Paths
from itertools import product

import cudf
import cupy as cp
import numpy as np
from cuml.ensemble import RandomForestClassifier
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from cuml.model_selection import GridSearchCV

# Define paths
# Absolute locations of the processed train/test CSV files; each is passed to
# cudf.read_csv by a later cell (training data in Cell 2, external test data in Cell 8).
processed_train_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/KDDTrain_processed.csv'
processed_test_path = '/root/autodl-tmp/projects/SL_NSL/dataset/processed/KDDTest_processed.csv'

In [45]:
# Cell 2: Load and Split Data
print("Loading processed training data...")
df_train = cudf.read_csv(processed_train_path)

# Define type conversion for feature columns: every column except the two
# label columns is treated as a feature and stored as float32.
feature_columns = [col for col in df_train.columns if col not in ['label', 'binary_label']]
df_train[feature_columns] = df_train[feature_columns].astype('float32')

# Select features and labels (the binary target is kept as int32)
X = df_train.drop(columns=['label', 'binary_label'])
y_binary = df_train['binary_label'].astype('int32')

# Split the dataset into training and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

print("Training and test sets split completed")
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of test samples: {X_test.shape[0]}")
# `dtypes` is indexed by column name, so the first entry is taken by position
# with .iloc; a bare [0] relies on the deprecated integer-as-position fallback.
print(f"Feature data type: {X_train.dtypes.iloc[0]}")
print(f"Label data type: {y_train.dtype}")

Loading processed training data...
Training and test sets split completed
Number of training samples: 100779
Number of test samples: 25194
Feature data type: float32
Label data type: int32


In [46]:
# Cell 3: Create Enhanced Random Forest Classifier
# Baseline GPU random forest used as the estimator handed to the grid-search object.
rf_binary = RandomForestClassifier(
    random_state=42,
    n_streams=1,            # single stream, kept for reproducibility
    n_bins=256,             # set for GPU optimization
    max_features='sqrt',
    min_samples_split=20,
    min_samples_leaf=10,
    max_depth=10,
    n_estimators=100,
)

In [47]:
# Cell 4: Setup Sampling and Grid Search
# Initialize samplers
smote = SMOTE(random_state=42, sampling_strategy='auto')

# Parameter grid for grid search.
# 'auto' is deliberately not listed under max_features: for a cuML classifier
# it is a deprecated alias that resolves to the same value as 'sqrt', so it
# only repeated the 'sqrt' fits without ever being able to beat them.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [8, 10, 12, 15, 20],
    'min_samples_split': [15, 20, 25, 30],
    'max_features': ['sqrt', 0.3, 0.5, 0.7],
    'min_samples_leaf': [5, 10, 15],
    'n_bins': [128, 256]
}

# Create stratified K-fold cross validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Custom scorer for GPU
def gpu_f1_score(y_true, y_pred):
    """Return the weighted-average F1 score for labels that may live on the GPU.

    Parameters
    ----------
    y_true, y_pred : cudf.Series, cupy.ndarray, or any host array-like
        True and predicted labels. cuDF Series are converted with
        ``to_pandas()`` and CuPy arrays with ``cupy.asnumpy`` before scoring;
        anything else is passed through unchanged.

    Returns
    -------
    The value returned by ``sklearn.metrics.f1_score(..., average='weighted')``.
    """
    def _to_host(values):
        # scikit-learn metrics work on host memory, so device containers are
        # copied back explicitly instead of relying on implicit conversion.
        if isinstance(values, cudf.Series):
            return values.to_pandas()
        if isinstance(values, cp.ndarray):
            return cp.asnumpy(values)
        return values

    return f1_score(_to_host(y_true), _to_host(y_pred), average='weighted')

# Wrap the weighted-F1 helper defined above in a scikit-learn scorer object;
# it is the value passed as `scoring` when the grid-search object is built.
scorer = make_scorer(gpu_f1_score)

In [48]:
# Cell 5: Create Grid Search
# Bundle the baseline forest, the candidate grid, the stratified folds and the
# weighted-F1 scorer into one search object. Only the object is constructed
# here; nothing is fitted in this cell.
grid_search = GridSearchCV(
    rf_binary,
    param_grid,
    scoring=scorer,
    cv=cv,
    verbose=1,
)

In [49]:
# Cell 6: Training and Evaluation Function
def _smote_to_gpu(X_host, y_host, feature_names):
    """Balance host arrays with the notebook-level SMOTE sampler and move the result to the GPU."""
    X_balanced, y_balanced = smote.fit_resample(X_host, y_host)
    # Keep the original feature names so training and prediction frames line up.
    X_gpu = cudf.DataFrame(X_balanced, columns=feature_names).astype('float32')
    y_gpu = cudf.Series(y_balanced).astype('int32')
    return X_gpu, y_gpu


def _compute_metrics(model, X, y):
    """Predict with `model` on `X` and return accuracy, confusion matrix and classification report."""
    y_pred = model.predict(X)

    # Ensure all labels are int32 for metrics calculation
    y_true_int = y.astype('int32')
    y_pred_int = y_pred.astype('int32')

    return {
        'accuracy': accuracy_score(y_true_int, y_pred_int),
        'confusion_matrix': confusion_matrix(y_true_int, y_pred_int),
        'classification_report': classification_report(
            y_true_int.to_numpy(),
            y_pred_int.to_numpy()
        )
    }


def _print_metrics(metrics):
    """Print a metrics dict produced by `_compute_metrics`."""
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    print("\nClassification Report:")
    print(metrics['classification_report'])


def train_and_evaluate(X_train, X_test, y_train, y_test, X_external=None, y_external=None):
    """Tune, train and evaluate a GPU random forest on SMOTE-balanced data.

    Steps:
      1. Balance the training data with SMOTE (on the CPU) and move it to the GPU.
      2. Search `n_estimators`, `max_depth`, `min_samples_split` and
         `max_features` from the notebook-level `param_grid`, scoring each
         candidate with weighted F1 on (`X_test`, `y_test`).
      3. Report accuracy, confusion matrix and classification report of the
         best model on (`X_test`, `y_test`).
      4. Cross-validate the best hyperparameters on the original training
         data, applying SMOTE to the training part of each fold only.
      5. Optionally report the same metrics on an external test set.

    Parameters
    ----------
    X_train, y_train : cudf.DataFrame, cudf.Series
        Training features and labels.
    X_test, y_test : cudf.DataFrame, cudf.Series
        Internal test features and labels.
    X_external, y_external : cudf.DataFrame, cudf.Series, optional
        External test set; evaluated only when both are given.

    Returns
    -------
    (best_model, internal_metrics) or, when an external set is supplied,
    (best_model, internal_metrics, external_metrics).

    Raises
    ------
    ValueError
        If `param_grid` yields no parameter combination to evaluate.
    """
    feature_names = list(X_train.columns)

    # Convert data to CPU for sampling and ensure float32, labels as int32
    X_train_cpu = X_train.to_numpy().astype('float32')
    y_train_cpu = y_train.to_numpy().astype('int32')

    # Apply SMOTE to balance the dataset, then convert back to GPU for training
    print("Applying SMOTE to balance training data...")
    X_train_gpu, y_train_gpu = _smote_to_gpu(X_train_cpu, y_train_cpu, feature_names)

    # Settings shared by every forest built here. The `min_samples_leaf` and
    # `n_bins` entries of `param_grid` are not searched; they stay fixed.
    fixed_params = {
        'min_samples_leaf': 10,
        'n_bins': 128,
        'n_streams': 1,
        'random_state': 42
    }
    searched_names = ('n_estimators', 'max_depth', 'min_samples_split', 'max_features')

    # Manual grid search implementation
    # NOTE(review): candidates are ranked on (X_test, y_test), the same data
    # the internal evaluation below reports on, so those internal numbers are
    # optimistic. A separate validation split would remove that bias.
    print("\nPerforming manual grid search...")
    best_score = float('-inf')  # the first candidate always becomes the baseline
    best_params = {}
    best_model = None

    # product() visits combinations in the same order as nested loops over
    # searched_names, so ties are still resolved in favour of the earliest one.
    for combination in product(*(param_grid[name] for name in searched_names)):
        candidate_params = dict(zip(searched_names, combination))
        current_model = RandomForestClassifier(**candidate_params, **fixed_params)

        # Train and evaluate
        current_model.fit(X_train_gpu, y_train_gpu)
        score = gpu_f1_score(y_test, current_model.predict(X_test))

        if score > best_score:
            best_score = score
            best_params = candidate_params
            best_model = current_model
            print(f"New best score: {best_score:.3f} with params: {best_params}")

    if best_model is None:
        raise ValueError("param_grid produced no parameter combinations to evaluate")

    print("\nBest parameters:", best_params)

    # Evaluate on internal test set
    print("\nInternal Test Set Evaluation:")
    internal_metrics = _compute_metrics(best_model, X_test, y_test)
    _print_metrics(internal_metrics)

    # Calculate cross-validation scores.
    # Folds are drawn from the original (unbalanced) training data and SMOTE is
    # applied to each fold's training part only, so validation folds hold real
    # samples. A fresh forest is built per fold so `best_model` is not refitted.
    print("\nPerforming cross-validation...")
    cv_scores = []
    for train_idx, val_idx in cv.split(X_train_cpu, y_train_cpu):
        X_fold_train_gpu, y_fold_train_gpu = _smote_to_gpu(
            X_train_cpu[train_idx], y_train_cpu[train_idx], feature_names
        )
        X_fold_val = cudf.DataFrame(X_train_cpu[val_idx], columns=feature_names)
        y_fold_val = y_train_cpu[val_idx]

        # Train and evaluate
        fold_model = RandomForestClassifier(**best_params, **fixed_params)
        fold_model.fit(X_fold_train_gpu, y_fold_train_gpu)
        y_fold_pred = fold_model.predict(X_fold_val)
        cv_scores.append(gpu_f1_score(y_fold_val, y_fold_pred))

    cv_scores = np.array(cv_scores)
    print(f"Cross-validation F1 scores: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    # External evaluation
    if X_external is not None and y_external is not None:
        print("\nExternal Test Set Evaluation:")
        external_metrics = _compute_metrics(best_model, X_external, y_external)
        _print_metrics(external_metrics)

        return best_model, internal_metrics, external_metrics

    return best_model, internal_metrics

In [50]:
# Cell 7: Early Stopping Monitor
def monitor_training(model, X_val, y_val, patience=3, X_train=None, y_train=None):
    """Grow the forest one tree at a time and keep the best-scoring size.

    For each size from 1 up to the model's initial ``n_estimators`` the model
    is refitted from scratch and scored with ``accuracy_score`` on
    (`X_val`, `y_val`). The search stops once `patience` consecutive sizes
    fail to improve on the best score. The model is returned fitted with the
    best size found, not the size at which the search stopped.

    Parameters
    ----------
    model : estimator with ``n_estimators``, ``fit`` and ``predict``
        Modified in place and returned.
    X_val, y_val :
        Data used for scoring.
    patience : int, default 3
        Number of consecutive non-improving sizes tolerated before stopping.
    X_train, y_train : optional
        Data used for fitting. When omitted the model is fitted on
        (`X_val`, `y_val`), so the score is a training-set accuracy; pass a
        separate training set to score on held-out data.

    Returns
    -------
    The same `model` object, fitted with the best ``n_estimators`` found.

    Raises
    ------
    ValueError
        If only one of `X_train` / `y_train` is provided.
    """
    if (X_train is None) != (y_train is None):
        raise ValueError("X_train and y_train must be provided together")
    if X_train is None:
        fit_X, fit_y = X_val, y_val
    else:
        fit_X, fit_y = X_train, y_train

    max_estimators = model.n_estimators
    best_score = 0
    best_n_estimators = None
    patience_counter = 0

    for i in range(max_estimators):
        model.n_estimators = i + 1
        model.fit(fit_X, fit_y)

        # Calculate score on GPU
        y_pred = model.predict(X_val)
        current_score = accuracy_score(y_val, y_pred)

        # The first size is always accepted as the baseline, even with score 0.
        if best_n_estimators is None or current_score > best_score:
            best_score = current_score
            best_n_estimators = i + 1
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at iteration {i}")
            break

    # The last fit may be a worse, larger forest; restore the best size.
    if best_n_estimators is not None and model.n_estimators != best_n_estimators:
        model.n_estimators = best_n_estimators
        model.fit(fit_X, fit_y)

    return model

In [51]:
# Cell 8: Run Training and Evaluation
# Load external test data
df_test = cudf.read_csv(processed_test_path)

# Convert numeric columns to float32
# NOTE(review): the selection runs over the whole frame, so 'label' /
# 'binary_label' are cast too if they were read as int64 or float64 — confirm
# this is intended (Cell 2 excludes the label columns from the same cast).
numeric_columns = df_test.select_dtypes(include=['float64', 'int64']).columns
df_test[numeric_columns] = df_test[numeric_columns].astype('float32')

# Features are every column except the two label columns; the binary target is int32
X_external = df_test.drop(columns=['label', 'binary_label']).astype('float32')
y_external = df_test['binary_label'].astype('int32')

# Run training and evaluation
# Both external arguments are supplied, so three values are returned.
best_model, internal_metrics, external_metrics = train_and_evaluate(
    X_train, X_test, y_train, y_test,
    X_external=X_external,
    y_external=y_external
)

# Print final results summary
print("\nFinal Model Training Complete!")
print(f"Internal Test Accuracy: {internal_metrics['accuracy']:.3f}")
print(f"External Test Accuracy: {external_metrics['accuracy']:.3f}")

Applying SMOTE to balance training data...
Training initial model...

Performing manual grid search...
New best score: 0.986 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 15, 'max_features': 'sqrt'}
New best score: 0.997 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 15, 'max_features': 0.3}
New best score: 0.997 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 15, 'max_features': 0.7}
New best score: 0.997 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 25, 'max_features': 0.3}
New best score: 0.997 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 25, 'max_features': 0.7}
New best score: 0.997 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 30, 'max_features': 0.3}
New best score: 0.997 with params: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 30, 'max_features': 0.5}
New best score: 0.998 with params: {'n_estimators': 50, 'max_depth':