In [1]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.7.0 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.
bigfr

In [2]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Deep learning imports
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

# Optimize for Kaggle
tf.config.optimizer.set_jit(True)  # Enable XLA

# Memory growth has to be enabled per physical GPU through set_memory_growth();
# assigning `tf.config.experimental.enable_memory_growth = True` only creates a
# new attribute on the module and changes nothing.
for gpu in tf.config.list_physical_devices('GPU'):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPU has already been initialized (e.g. this cell is
        # re-run in a live kernel); the setting can no longer be changed then.
        print(f"Could not enable memory growth for {gpu.name}: {err}")

ImportError: cannot import name '_safe_tags' from 'sklearn.utils._tags' (/usr/local/lib/python3.11/dist-packages/sklearn/utils/_tags.py)

In [None]:
# Seed NumPy and TensorFlow so repeated runs draw the same random numbers
tf.random.set_seed(42)
np.random.seed(42)

# Read the raw CSV into a DataFrame
print("Loading the dataset...")
file_path = "/kaggle/input/navbot25-v8/NavBot25_V8.csv"
data = pd.read_csv(file_path)

# Label text -> integer class id; the position in the list is the class id
attack_mapping = {
    label_text: class_id
    for class_id, label_text in enumerate([
        "Normal",
        "DoS Attack",
        "UnauthSub Attack",
        "SSH Bruteforce",
        "Pubflood",
        "Subflood",
        "Reverse Shell",
        "Port Scanning Attack",
    ])
}


In [None]:
# Encode the text labels as integers, drop rows whose label is not a key of
# attack_mapping (they map to NaN), and remove identifier-like columns.
# Built as a single chain so no intermediate frame is assigned into.
# NOTE: this overwrites `data`, so re-run the loading cell before re-running
# this one (mapping already-encoded labels would turn every label into NaN).
columns_to_drop = ['Flow ID', 'Src IP', 'Dst IP', 'Protocol', 'Timestamp']
data = (
    data.assign(Label=data["Label"].map(attack_mapping))
    .dropna(subset=["Label"])
    .astype({"Label": int})
    .drop(columns=columns_to_drop, errors='ignore')
)

# Treat +/-inf as missing BEFORE any statistic is computed: a column mean
# taken while inf values are still present is itself inf (or NaN).
data = data.replace([np.inf, -np.inf], np.nan)

# Split into features and labels
X = data.drop('Label', axis=1)
y = data['Label']

# Train-test split (stratified so both parts keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Impute missing values with statistics from the TRAINING split only and apply
# the same values to the test split. Computing means on the full dataset, or
# on the test split itself, leaks test information into preprocessing.
numeric_columns = X_train.select_dtypes(include=['number']).columns
train_means = X_train[numeric_columns].mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

In [None]:
# Oversample the training split with SMOTE
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

# Add a length-1 time axis: (samples, features) -> (samples, timesteps, features)
timesteps = 1
n_features = X_train_scaled.shape[-1]
X_train_reshaped = X_train_scaled.reshape((len(X_train_scaled), timesteps, n_features))
X_test_reshaped = X_test_scaled.reshape((len(X_test_scaled), timesteps, n_features))

# One-hot targets for the softmax head; one column per entry in attack_mapping
num_classes = len(attack_mapping)
y_train_onehot = to_categorical(y_train_balanced, num_classes=num_classes)
y_test_onehot = to_categorical(y_test, num_classes=num_classes)


In [None]:
# =============================================================================
# LIGHTWEIGHT FEATURE EXTRACTION: Simplified CNN+LSTM
# =============================================================================
def create_lightweight_feature_extractor(input_shape, n_classes=None):
    """Build a small Conv1D + LSTM classifier and a feature extractor sharing its layers.

    Args:
        input_shape: Shape of one sample passed to ``Input``; the notebook
            calls this with ``(timesteps, n_features)``.
        n_classes: Number of units in the softmax output layer. When omitted,
            the notebook-level ``num_classes`` is used, so existing
            one-argument calls behave as before.

    Returns:
        A ``(model, feature_extractor)`` tuple. ``model`` maps the input to
        class probabilities. ``feature_extractor`` maps the same input to the
        32-unit tensor that feeds the classification head, i.e. the output of
        the ``extracted_features`` Dense layer after BatchNormalization and
        Dropout have been applied to it.
    """
    if n_classes is None:
        # Fall back to the notebook global for backward compatibility.
        n_classes = num_classes

    inputs = Input(shape=input_shape)

    # Simplified CNN layers
    x = Conv1D(32, kernel_size=1, activation='relu', padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

    # Single LSTM layer; only the final state is returned
    x = LSTM(50, return_sequences=False)(x)
    x = Dropout(0.3)(x)

    # Smaller feature extraction layer
    features = Dense(32, activation='relu', name='extracted_features')(x)
    features = BatchNormalization()(features)
    features = Dropout(0.3)(features)

    # Classification head
    outputs = Dense(n_classes, activation='softmax')(features)

    # Both models are built on the same layer graph, so training `model`
    # also trains the layers used by `feature_extractor`.
    model = Model(inputs, outputs)
    feature_extractor = Model(inputs, features)

    return model, feature_extractor

print("Creating lightweight CNN+LSTM feature extraction model...")
full_model, feature_extractor = create_lightweight_feature_extractor((timesteps, n_features))

# Compile with reduced learning rate
full_model.compile(
    optimizer=Adam(0.0005),  # Reduced learning rate
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Hold out a stratified 15% validation set explicitly. Keras' validation_split
# takes the LAST rows of the arrays without shuffling, and the training arrays
# here come from SMOTE resampling (assumed to place the synthetic rows after
# the original ones — confirm against imblearn), so the tail is not a
# representative sample for EarlyStopping to monitor.
nn_fit_idx, nn_val_idx = train_test_split(
    np.arange(len(X_train_reshaped)),
    test_size=0.15,
    stratify=np.asarray(y_train_balanced),
    random_state=42,
)

print("Training CNN+LSTM feature extractor (reduced epochs)...")
# Reduced training parameters
history = full_model.fit(
    X_train_reshaped[nn_fit_idx], y_train_onehot[nn_fit_idx],
    validation_data=(X_train_reshaped[nn_val_idx], y_train_onehot[nn_val_idx]),
    epochs=20,              # Reduced epochs
    batch_size=64,          # Larger batch size
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

# Extract features for every training row (fit and validation parts alike)
# and for the test rows
print("Extracting features...")
X_train_features = feature_extractor.predict(X_train_reshaped, batch_size=128)
X_test_features = feature_extractor.predict(X_test_reshaped, batch_size=128)

# Memory cleanup (re-running this cell requires re-running the reshape cell)
del X_train_reshaped, X_test_reshaped, full_model, nn_fit_idx, nn_val_idx
gc.collect()

print(f"Original feature shape: {X_train_scaled.shape}")
print(f"Extracted feature shape: {X_train_features.shape}")


In [None]:
# =============================================================================
# DIMENSIONALITY REDUCTION: plain (linear) PCA on the extracted features
# =============================================================================
print("Applying PCA for dimensionality reduction...")

# Learn a 16-component projection from the training features and apply the
# same projection to the test features.
pca = PCA(random_state=42, n_components=16)
X_train_pca = pca.fit_transform(X_train_features)
X_test_pca = pca.transform(X_test_features)

print(f"After PCA: {X_train_pca.shape}")

# The extracted features are not used after this point; release them
del X_train_features
del X_test_features
gc.collect()

In [None]:
# =============================================================================
# PHASE 1: Simplified KNN and Random Forest
# =============================================================================
print("\n" + "="*60)
print("PHASE 1: Training KNN and Random Forest (Simplified)")
print("="*60)

# Simplified classifiers
knn_classifier = KNeighborsClassifier(n_neighbors=3)  # Reduced neighbors
rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)  # Reduced trees

# Meta-features for the TRAINING rows must be out-of-fold predictions.
# Scoring the training rows with models fitted on those same rows (KNN counts
# each row as its own neighbour) yields near-perfect probabilities that the
# final classifier learns to over-trust and that the test rows cannot match.
# cross_val_predict works on clones, so the estimators above stay unfitted here.
print("Computing out-of-fold probabilities for stacking...")
oof_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
knn_proba_train = cross_val_predict(
    knn_classifier, X_train_pca, y_train_balanced, cv=oof_cv, method='predict_proba')
rf_proba_train = cross_val_predict(
    rf_classifier, X_train_pca, y_train_balanced, cv=oof_cv, method='predict_proba')

# Fit on the full training set; these fitted models score the test rows
print("Training simplified classifiers...")
knn_classifier.fit(X_train_pca, y_train_balanced)
rf_classifier.fit(X_train_pca, y_train_balanced)

# Test-set probabilities and hard predictions
knn_proba_test = knn_classifier.predict_proba(X_test_pca)
rf_proba_test = rf_classifier.predict_proba(X_test_pca)

knn_pred_test = knn_classifier.predict(X_test_pca)
rf_pred_test = rf_classifier.predict(X_test_pca)

print(f"KNN Test Accuracy: {accuracy_score(y_test, knn_pred_test):.4f}")
print(f"Random Forest Test Accuracy: {accuracy_score(y_test, rf_pred_test):.4f}")

# =============================================================================
# FEATURE FUSION: Simplified
# =============================================================================
print("\nPerforming Feature Fusion...")

# Fused vector = PCA components + KNN class probabilities + RF class probabilities
train_fused_features = np.concatenate([
    X_train_pca,
    knn_proba_train,
    rf_proba_train
], axis=1)

test_fused_features = np.concatenate([
    X_test_pca,
    knn_proba_test,
    rf_proba_test
], axis=1)

print(f"Fused feature shape: {train_fused_features.shape}")

# Memory cleanup
del X_train_pca, X_test_pca, knn_proba_train, knn_proba_test, rf_proba_train, rf_proba_test
gc.collect()

In [None]:
# =============================================================================
# PHASE 2: Final Logistic Regression
# =============================================================================
print(f"\n{'=' * 60}")
print("PHASE 2: Training Final Logistic Regression")
print('=' * 60)

# Meta-classifier trained on the fused training features;
# fit() returns the estimator itself, so the construct-and-fit chain is safe.
lr_classifier = LogisticRegression(random_state=42, max_iter=500).fit(
    train_fused_features, y_train_balanced
)

# Hard class predictions for the held-out test rows
final_pred_test = lr_classifier.predict(test_fused_features)

In [None]:
# =============================================================================
# STRATIFIED K-FOLD CROSS VALIDATION OF THE FINAL LOGISTIC REGRESSION
# =============================================================================
# Number of folds, defined once and reused for the splitter and the messages.
N_SPLITS = 5

print("\n" + "="*60)
print(f"{N_SPLITS}-FOLD CROSS VALIDATION - FINAL MODEL (Kaggle Optimized)")
print("="*60)

# NOTE: the folds are cut from train_fused_features, i.e. from the
# SMOTE-resampled training set, so validation folds contain synthetic rows.
# Read these scores as optimistic relative to the held-out test accuracy.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
cv_scores = []
train_scores = []

# Positional indexing below needs a plain array (the labels may be a pandas
# Series or already a NumPy array, depending on what the resampler returned).
y_balanced_array = np.asarray(y_train_balanced)

for fold_num, (train_idx, val_idx) in enumerate(
        cv.split(train_fused_features, y_balanced_array), start=1):
    print(f"Processing Fold {fold_num}/{N_SPLITS}...")

    # Split data
    X_train_fold = train_fused_features[train_idx]
    X_val_fold = train_fused_features[val_idx]
    y_train_fold = y_balanced_array[train_idx]
    y_val_fold = y_balanced_array[val_idx]

    # Train classifier
    lr_fold = LogisticRegression(max_iter=300, random_state=42)
    lr_fold.fit(X_train_fold, y_train_fold)

    # Evaluate: accuracy on the held-out fold and on the fold's training part
    val_score = lr_fold.score(X_val_fold, y_val_fold)
    train_score = lr_fold.score(X_train_fold, y_train_fold)

    cv_scores.append(val_score)
    train_scores.append(train_score)

    # These are validation-fold scores, not test-set scores, and are labelled so.
    print(f"   Fold {fold_num} - Training: {train_score:.4f}, Validation: {val_score:.4f}")

cv_scores = np.array(cv_scores)
train_scores = np.array(train_scores)

print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean validation accuracy: {np.mean(cv_scores):.4f}")
print(f"Standard deviation: {np.std(cv_scores):.4f}")

In [None]:
# =============================================================================
# RESULTS AND VISUALIZATION
# =============================================================================
print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)
print(f"Final Test Accuracy: {accuracy_score(y_test, final_pred_test):.4f}")
print(f"Cross-Validation Mean: {np.mean(cv_scores):.4f}")
print("\nClassification Report:")

# Class names and their integer ids, in attack_mapping order. Passing the ids
# as `labels` pins the report and the confusion matrix to this full class
# list, so names and tick labels stay aligned even if a class does not occur
# in y_test / final_pred_test.
class_names = list(attack_mapping.keys())
class_ids = list(attack_mapping.values())

# Get the classification report as a dictionary
report_dict = classification_report(y_test, final_pred_test,
                                    labels=class_ids,
                                    target_names=class_names,
                                    output_dict=True,
                                    zero_division=0)


def format_percentage(value):
    """Return a fraction in [0, 1] as a percentage string with two decimals."""
    return f"{value * 100:.2f}"


def plot_confusion_heatmap(matrix, tick_labels, fmt, cmap, cbar_label, title):
    """Draw one annotated confusion-matrix heatmap and show it.

    Args:
        matrix: 2-D array; rows are true classes, columns are predicted classes.
        tick_labels: Class names used for both axes, in matrix order.
        fmt: Format spec for the cell annotations (e.g. '.2f' or 'd').
        cmap: Matplotlib colormap name.
        cbar_label: Text shown next to the colour bar.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap=cmap,
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                cbar_kws={'label': cbar_label},
                ax=ax)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    plt.show()


# Print formatted classification report
print(f"{'':20} {'precision':>10} {'recall':>10} {'f1-score':>10} {'support':>10}")
print("-" * 65)

for class_name in class_names:
    row = report_dict[class_name]
    precision = format_percentage(row['precision'])
    recall = format_percentage(row['recall'])
    f1 = format_percentage(row['f1-score'])
    print(f"{class_name:20} {precision:>10} {recall:>10} {f1:>10} {row['support']:>10.0f}")

print("-" * 65)

# Accuracy is computed directly rather than read from report_dict['accuracy'],
# because the report may provide a 'micro avg' entry instead of 'accuracy'
# when `labels` is given.
accuracy = format_percentage(accuracy_score(y_test, final_pred_test))
total_support = report_dict['macro avg']['support']
print(f"{'accuracy':20} {'':<10} {'':<10} {accuracy:>10} {total_support:>10.0f}")

for avg_name in ('macro avg', 'weighted avg'):
    avg_row = report_dict[avg_name]
    avg_precision = format_percentage(avg_row['precision'])
    avg_recall = format_percentage(avg_row['recall'])
    avg_f1 = format_percentage(avg_row['f1-score'])
    print(f"{avg_name:20} {avg_precision:>10} {avg_recall:>10} {avg_f1:>10} {avg_row['support']:>10.0f}")

# Per-fold accuracy plot. Fold numbers are derived from the scores themselves
# so the x-axis always matches the number of folds that were actually run.
fold_ids = np.arange(1, len(cv_scores) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
# The fold scores are validation scores from cross-validation on the training
# data, not test-set scores, so the curve is labelled accordingly.
ax.plot(fold_ids, cv_scores, marker='o', label='Validation Accuracy',
        color='blue', linestyle='-', linewidth=2, markersize=8)
ax.plot(fold_ids, train_scores, marker='x', label='Training Accuracy',
        color='red', linestyle='--', linewidth=2, markersize=10)

# Add labels for validation accuracy points
for fold_id, score in zip(fold_ids, cv_scores):
    ax.annotate(f'{score:.4f}', (fold_id, score), textcoords="offset points",
                xytext=(5, 5), ha='left', fontsize=9, color='blue')

# Add labels for training accuracy points
for fold_id, score in zip(fold_ids, train_scores):
    ax.annotate(f'{score:.4f}', (fold_id, score), textcoords="offset points",
                xytext=(5, -15), ha='left', fontsize=9, color='red')

ax.set_title(f'{len(cv_scores)}-Fold Cross-Validation Results (R-NIDS)',
             fontsize=12, fontweight='bold')
ax.set_xlabel('Fold Number')
ax.set_ylabel('Accuracy')
ax.set_xticks(fold_ids)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Confusion matrix over the full class list (rows: true, columns: predicted)
cm = confusion_matrix(y_test, final_pred_test, labels=class_ids)

# Row-normalise to percentages; rows with no true samples stay at 0 instead of
# producing NaN from a 0/0 division.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_percentage = np.divide(cm.astype('float'), row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals != 0) * 100

# Confusion Matrix with Percentages (2 decimal places)
plot_confusion_heatmap(cm_percentage, class_names, fmt='.2f', cmap='Blues',
                       cbar_label='Percentage (%)',
                       title='Confusion Matrix (Percentages)\nR-NIDS')

# Raw counts confusion matrix for reference
plot_confusion_heatmap(cm, class_names, fmt='d', cmap='Greens',
                       cbar_label='Count',
                       title='Confusion Matrix (Raw Counts)\nR-NIDS')

print("\n" + "="*60)
print("KAGGLE-OPTIMIZED PIPELINE COMPLETED!")
print("="*60)