In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. CORRECT HEADER ORDER
# Column names applied to the header-less NSL-KDD text file.
# Positions 1-41 are the input features; position 42 is 'outcome' (the attack
# label) and position 43 is 'level' (the score).
columns = [
    # positions 1-9
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    # positions 10-22
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    # positions 23-31
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # positions 32-41
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # positions 42-43: label and score
    'outcome', 'level',
]

# Kaggle input directories (both end with a trailing slash so file names can
# be appended directly).
local_path = "/kaggle/input/nslkdd/"
train_path = "/kaggle/input/nsl-kdd-augmented/"

# 2. LOAD DATASETS
# Training data: read with pandas' default header handling, so the first row
# of the CSV supplies the column names.
# NOTE(review): presumably the SMOTE-augmented training set, going by the
# file name — confirm against the notebook that produced it.
df_train = pd.read_csv(f"{train_path}smote_augmented.csv")

# Test data: the raw file has no header row, so the names come from `columns`.
df_test = pd.read_csv(
    f"{local_path}KDDTest+.txt",
    names=columns,
    header=None,
)

# 3. FILTER TEST SET LABELS
# Keep only the test rows whose 'outcome' value also occurs in the training
# data. Every metric computed from `df_test_filtered` therefore describes this
# subset of the test file, not the whole file.
train_labels = df_train['outcome'].unique()
df_test_filtered = df_test.loc[df_test['outcome'].isin(train_labels)].copy()

# Report how many rows the filter removed.
print(f"Original test samples: {df_test.shape[0]}")
print(f"Filtered test samples: {df_test_filtered.shape[0]}")

# 4. SEPARATE FEATURES AND TARGET
# Features: every column except the label ('outcome') and the score ('level').
# errors='ignore' lets the same call succeed when a frame lacks one of them.
X_train = df_train.drop(['outcome', 'level'], axis='columns', errors='ignore')
X_test = df_test_filtered.drop(['outcome', 'level'], axis='columns', errors='ignore')

# Targets: the attack label column of each frame.
y_train, y_test = df_train['outcome'], df_test_filtered['outcome']

# 5. PREPROCESSING
# Columns handled as categorical; every other column of X_train is handled as
# numerical (original column order is kept).
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = X_train.columns[~X_train.columns.isin(categorical_cols)].tolist()

# Standardise the numerical columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not seen during fit; sparse_output=False yields a dense array.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols),
])

# Encode targets: the label encoder is fitted on the training labels only and
# then reused for the test labels (which were filtered to classes present in
# training, so every test label is known to the encoder).
le = LabelEncoder()
y_train_final = le.fit_transform(y_train)
y_test_final = le.transform(y_test)

# Fit the feature preprocessor on the training frame only, then apply the same
# fitted transformation to the test frame.
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.transform(X_test)

Original test samples: 22544
Filtered test samples: 18794


In [3]:
# Rebind the short names used by the modelling cells. From here on X_train and
# X_test refer to the preprocessed feature matrices (not the raw DataFrames),
# and the *_enc names refer to the integer-encoded labels.
X_train, X_test = X_train_final, X_test_final
y_train_enc, y_test_enc = y_train_final, y_test_final

In [4]:
# Estimator classes that were not imported earlier, gathered at the top of the cell.
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate classifiers. This cell only constructs them; nothing is fitted here.
# random_state=42 is used wherever the estimator accepts a seed.

# Random forest: 150 trees, all CPU cores, class weights balanced.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=150,
    n_jobs=-1,
    random_state=42
)

# XGBoost: histogram tree method on a CUDA device.
xgb_model = XGBClassifier(
    tree_method='hist',  # Faster for large datasets
    device='cuda',       # needs a GPU runtime; use 'cpu' when none is available
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42
)

# LightGBM: 200 boosting rounds with balanced class weights.
lgbm_model = lgb.LGBMClassifier(
    class_weight='balanced',
    learning_rate=0.05,
    n_estimators=200,
    random_state=42
)

# RBF-kernel SVM with balanced class weights.
svm_model = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,  # enables predict_proba, e.g. for ROC-AUC metrics
    random_state=42
)

# k-nearest neighbours with k=5, using all CPU cores.
knn_model = KNeighborsClassifier(
    n_jobs=-1,
    n_neighbors=5
)

In [5]:
# Train the LightGBM classifier on the preprocessed training features and the
# integer-encoded training labels.
lgbm_model.fit(X_train, y_train_enc)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5768
[LightGBM] [Info] Number of data points in the train set: 557934, number of used features: 115
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.135494
[LightGBM] [Info] Start training from score -3.1354

In [6]:
# Predict encoded class ids for the preprocessed test matrix with the LightGBM model.
y_pred = lgbm_model.predict(X_test)

# Share of test rows whose predicted class matches the true class.
print("Accuracy:", accuracy_score(y_true=y_test_enc, y_pred=y_pred))



Accuracy: 0.756198786846866


In [7]:
import numpy as np

# Encoded class ids that actually occur in the test labels.
test_labels = np.unique(y_test_enc)

# `le.classes_` lists every class seen during training, which can be a
# superset of the classes present in the test labels. Passing it as
# `target_names` next to a shorter `labels` array makes scikit-learn pair
# names with rows purely by position (it only warns about the size mismatch),
# so every row after the first missing class would carry the wrong name.
# Decode exactly the ids being reported so each row gets its own class name.
test_label_names = le.inverse_transform(test_labels)

print("\nClassification Report:\n",
      classification_report(
          y_test_enc,
          y_pred,
          labels=test_labels,
          target_names=test_label_names
      ))


Classification Report:
                  precision    recall  f1-score   support

           back       0.38      1.00      0.56       359
buffer_overflow       0.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.87      0.04      0.07      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.38      0.96      0.54       141
           land       0.58      1.00      0.74         7
     loadmodule       0.02      1.00      0.03         2
       multihop       0.06      0.11      0.07        18
        neptune       0.99      0.95      0.97      4657
           nmap       0.38      0.96      0.55        73
         normal       0.89      0.81      0.85      9711
           perl       0.02      0.50      0.04         2
            phf       0.00      0.50      0.01         2
            pod       0.12      0.66      0.21        41
      portsweep       0.56      0.59      0.58       157
     

