In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. DEFINE HEADERS AND SELECTED FEATURES
# Full header list, in file order: 41 feature columns followed by the
# 'outcome' label and the 'level' column (43 names in total).
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
    'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'outcome', 'level',
]

# The 20 features the models are trained on. The order below is the column
# order of the extracted feature frames, so it must not be rearranged.
selected_features = [
    'src_bytes',
    'same_srv_rate',
    'flag',
    'dst_host_serror_rate',
    'srv_serror_rate',
    'dst_host_same_srv_rate',
    'diff_srv_rate',
    'count',
    'dst_host_srv_serror_rate',
    'serror_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_bytes',
    'dst_host_diff_srv_rate',
    'protocol_type',
    'dst_host_srv_count',
    'service',
    'srv_count',
    'dst_host_count',
    'dst_host_rerror_rate',
]

# Kaggle input directories (trailing slash included so file names can be
# appended directly).
local_path = "/kaggle/input/nslkdd/"
train_path = "/kaggle/input/nsl-kdd-augmented/"

# 2. LOAD DATASETS
# The training CSV is read with pandas' default header handling; the test
# file is read as header-less and given the names from `columns`.
df_train = pd.read_csv(f"{train_path}smote_augmented.csv")
df_test = pd.read_csv(
    f"{local_path}KDDTest+.txt",
    header=None,
    names=columns,
)

# 3. FILTER TEST SET LABELS (Keep only labels present in Training)
# Test rows whose outcome never occurs in the training data are dropped.
train_labels = df_train['outcome'].unique()
df_test_filtered = df_test.loc[df_test['outcome'].isin(train_labels)].copy()

# 4. SUBSET TO SELECTED FEATURES
# Feature matrices: independent copies restricted to the selected columns
X_train = df_train.loc[:, selected_features].copy()
X_test = df_test_filtered.loc[:, selected_features].copy()

# Targets: the raw 'outcome' column of each frame
y_train = df_train['outcome']
y_test = df_test_filtered['outcome']

print(f"Features used: {X_train.shape[1]}")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# 5. PREPROCESSING (Only for selected features)
# Partition the selected features into categorical and numerical groups,
# keeping the order in which they appear in `selected_features`.
all_categorical = ['protocol_type', 'service', 'flag']
categorical_cols = [name for name in selected_features if name in all_categorical]
numerical_cols = [name for name in selected_features if name not in all_categorical]

# Numerical columns are standardized; categorical columns are one-hot encoded
# into a dense array, and categories not seen during fit are ignored at
# transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_cols,
        ),
    ],
)

# Fit on the training features only, then apply the fitted transform to test
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.transform(X_test)

# Encode targets: integer codes are learned from the training outcomes and
# the same mapping is applied to the test outcomes
le = LabelEncoder()
y_train_final = le.fit_transform(y_train)
y_test_final = le.transform(y_test)

Features used: 20
Train shape: (557934, 20), Test shape: (18794, 20)


In [3]:
# Short aliases for the preprocessed data used by the modelling cells.
# NOTE: X_train / X_test are rebound here from the raw feature DataFrames to
# the preprocessor's transformed output.
X_train, X_test = X_train_final, X_test_final
y_train_enc, y_test_enc = y_train_final, y_test_final

In [4]:
# Imports needed only by this cell, gathered in one place
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate classifiers. This cell only constructs the estimators; none of
# them is fitted here.

# Random forest with balanced class weights, using all CPU cores
rf_model = RandomForestClassifier(
    n_estimators=150,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Gradient-boosted trees (XGBoost)
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',  # Faster for large datasets
    device='cuda',       # Use if you have a GPU
    random_state=42,
)

# Gradient-boosted trees (LightGBM) with balanced class weights
lgbm_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42,
)

# RBF-kernel support vector machine with balanced class weights
svm_model = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,  # Set to True if you need ROC-AUC metrics
    random_state=42,
)

# k-nearest neighbours (k=5), using all CPU cores
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=-1,
)

In [8]:
# Train the XGBoost classifier on the preprocessed features and encoded labels
xgb_model.fit(X=X_train, y=y_train_enc)

  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# Predict encoded class labels for the test set
y_pred = xgb_model.predict(X_test)

# Overall accuracy against the encoded test labels
test_accuracy = accuracy_score(y_true=y_test_enc, y_pred=y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.8392040012770033


In [10]:
import numpy as np

# Get labels actually present in test set
test_labels = np.unique(y_test_enc)

# classification_report pairs `target_names` with `labels` by position, so the
# names must be derived from exactly the labels being reported. Passing the
# full `le.classes_` only lines up when every encoder class occurs in the test
# set; otherwise every row after the first missing class gets the wrong name.
test_label_names = le.inverse_transform(test_labels)

print("\nClassification Report:\n",
      classification_report(
          y_test_enc,
          y_pred,
          labels=test_labels,
          target_names=test_label_names,
          # Undefined precision/recall is reported as 0 (same value as the
          # default) without emitting the warning for each metric.
          zero_division=0,
      ))


Classification Report:
                  precision    recall  f1-score   support

           back       0.62      1.00      0.77       359
buffer_overflow       0.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.80      0.00      0.01      1231
           imap       0.00      0.00      0.00         1
        ipsweep       0.91      0.99      0.95       141
           land       1.00      0.57      0.73         7
     loadmodule       0.00      0.00      0.00         2
       multihop       0.00      0.00      0.00        18
        neptune       1.00      0.98      0.99      4657
           nmap       0.84      1.00      0.91        73
         normal       0.89      0.93      0.91      9711
           perl       0.03      0.50      0.05         2
            phf       0.00      0.50      0.01         2
            pod       0.49      0.88      0.63        41
      portsweep       0.72      0.90      0.80       157
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
