# AI-Powered Network Intrusion Detection System (NIDS) using Supervised Learning

## 🔹 Step 1: Load and Explore the Dataset

In [None]:

import pandas as pd

# Remote locations of the NSL-KDD training and test splits.
train_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
test_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt"

# Column names (from NSL-KDD documentation); handed to read_csv through
# `names=` below, in this exact order.
col_names = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label",             # target column, binarised in Step 2
    "difficulty_level",  # removed right after loading
]

# Parse both splits with the explicit column names.
train_df = pd.read_csv(train_url, names=col_names)
test_df = pd.read_csv(test_url, names=col_names)

# difficulty_level is not used as a feature, so strip it from both frames.
train_df = train_df.drop(columns=["difficulty_level"])
test_df = test_df.drop(columns=["difficulty_level"])

# Preview the first rows of the training frame.
train_df.head()


## 🔹 Step 2: Preprocess the Data

In [None]:

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Work on copies so the frames loaded in Step 1 stay untouched.
df_train = train_df.copy()
df_test = test_df.copy()

# Collapse the label to a binary target: 'normal' -> 0, every other label -> 1.
df_train['label'] = df_train['label'].ne('normal').astype('int64')
df_test['label'] = df_test['label'].ne('normal').astype('int64')

# Separate the feature matrix from the target for each split.
y_train = df_train['label']
X_train = df_train.drop(columns=['label'])
y_test = df_test['label']
X_test = df_test.drop(columns=['label'])

# The three string-valued columns that need one-hot encoding.
categorical_cols = ['protocol_type', 'service', 'flag']

# One-hot encode the categorical columns and pass every other column through
# unchanged. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present when the encoder was fitted.
column_transformer = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)

# Fit the encoder on the training split only, then reuse it for the test split.
X_train_encoded = column_transformer.fit_transform(X_train)
X_test_encoded = column_transformer.transform(X_test)

print("✅ Preprocessing done. X_train shape:", X_train_encoded.shape)


## 🔹 Step 3: Train the Model

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Build a 100-tree random forest with a fixed seed and fit it on the encoded
# training split (fit() returns the estimator itself).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_encoded, y_train
)

# Score the held-out test split.
y_pred = clf.predict(X_test_encoded)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

# Report overall accuracy followed by the per-class metrics.
print("✅ Accuracy:", test_accuracy)
print("\n📊 Classification Report:\n", test_report)


## 🔹 Step 4: Save and Use the Model (Optional)

In [None]:

import joblib

# Persist the fitted classifier and the fitted encoder side by side; both are
# needed to score new data, since the model was trained on encoded features.
joblib.dump(value=clf, filename='nids_rf_model.pkl')
joblib.dump(value=column_transformer, filename='nids_preprocessor.pkl')

# To restore them in a later session:
# model = joblib.load('nids_rf_model.pkl')
# preprocessor = joblib.load('nids_preprocessor.pkl')


In [None]:

# === Enhanced Preprocessing ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load your dataset here.
# By default the binary-labelled frames prepared in Step 2 are combined so the
# cell runs as-is (previously `df` was never defined and the cell failed with a
# NameError). To use other data, replace the next line with e.g.
# df = pd.read_csv('your_dataset.csv') -- it must contain a 0/1 'label' column.
df = pd.concat([df_train, df_test], ignore_index=True)

# Handling missing values
# NOTE: this standalone imputer is not referenced by the pipeline below, which
# carries its own per-column-type imputers; it is kept only so the name stays
# defined for any code that may rely on it.
imputer = SimpleImputer(strategy='most_frequent')

# Removing outliers using IQR
# Quartiles only make sense for numeric feature columns: string columns cannot
# be compared against numeric fences, and the target must not take part in the
# filter.
numeric_feature_cols = (
    df.drop(columns='label').select_dtypes(include='number').columns
)
Q1 = df[numeric_feature_cols].quantile(0.25)
Q3 = df[numeric_feature_cols].quantile(0.75)
IQR = Q3 - Q1

# Columns whose IQR is zero would flag every row that differs from the median
# as an outlier, so they are left out of the filter.
iqr_cols = IQR[IQR > 0].index
lower_fence = Q1[iqr_cols] - 1.5 * IQR[iqr_cols]
upper_fence = Q3[iqr_cols] + 1.5 * IQR[iqr_cols]
outlier_rows = (
    (df[iqr_cols] < lower_fence) | (df[iqr_cols] > upper_fence)
).any(axis=1)

# NOTE(review): on intrusion data, extreme feature values may be exactly the
# attack traffic the model should learn -- confirm this filter is wanted.
df = df[~outlier_rows]

# Splitting features and labels
X = df.drop('label', axis=1)
y = df['label']

# Identify categorical and numerical columns.
# 'number' covers every numeric dtype; columns matched by neither selector are
# dropped by the ColumnTransformer below (its default remainder is 'drop').
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include='number').columns

# Preprocessing pipeline:
#   numeric     -> mean imputation, then standardisation
#   categorical -> most-frequent imputation, then one-hot encoding that
#                  ignores categories unseen during fit
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), num_cols),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), cat_cols)
])


In [None]:

# === Model Pipeline and Training ===
from sklearn.calibration import CalibratedClassifierCV

# Splitting the data
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier steps.
# stratify=y keeps the class ratio the same in both parts, so the evaluation is
# not skewed by an unlucky draw on imbalanced labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Classifier pipeline: preprocessing followed by a random forest whose scores
# are calibrated with sigmoid scaling. The forest is seeded (as in Step 3) so
# repeated runs produce the same model.
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', CalibratedClassifierCV(
        RandomForestClassifier(n_estimators=100, random_state=42),
        method='sigmoid',
    ))
])

# Train the model
clf_pipeline.fit(X_train, y_train)

# Save the model (preprocessing and classifier travel together in one file)
joblib.dump(clf_pipeline, 'nids_model_pipeline.pkl')


In [None]:

# === Postprocessing and Evaluation ===
# A sample is flagged as an attack only when its calibrated probability for
# class 1 exceeds this value; raising it trades missed attacks for fewer
# false alarms.
ATTACK_THRESHOLD = 0.7

# Predict probabilities and threshold
# Column 1 of predict_proba is taken as the attack class
# (assumes the labels are 0/1 as prepared earlier).
y_proba = clf_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_proba > ATTACK_THRESHOLD).astype(int)  # Use thresholding

# Evaluate
print(classification_report(y_test, y_pred))

# labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
# both y_test and y_pred; otherwise the indexing below raises IndexError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:\n", cm)
print("False Positives:", cm[0, 1])  # true 0, predicted 1
print("False Negatives:", cm[1, 0])  # true 1, predicted 0


In [None]:

# === Explainability with SHAP ===
import shap

# Use a sample for SHAP due to performance
X_sample = X_test[:100]

# CalibratedClassifierCV fits clones of the forest it was given, so the
# estimator passed to its constructor stays unfitted. The fitted forests live
# in calibrated_classifiers_; the first one is explained here as a
# representative of the ensemble.
calibrated_step = clf_pipeline.named_steps['classifier']
fitted_member = calibrated_step.calibrated_classifiers_[0]
# Newer scikit-learn exposes the fitted model as `.estimator`; older releases
# call it `.base_estimator`.
fitted_forest = getattr(fitted_member, 'estimator', None)
if fitted_forest is None:
    fitted_forest = fitted_member.base_estimator

# Run the sample through the fitted preprocessing step. The ColumnTransformer
# may return a sparse matrix; densify it before handing it to SHAP.
X_sample_transformed = clf_pipeline.named_steps['preprocessor'].transform(X_sample)
if hasattr(X_sample_transformed, 'toarray'):
    X_sample_transformed = X_sample_transformed.toarray()

# Explain model predictions
explainer = shap.Explainer(fitted_forest)
shap_values = explainer(X_sample_transformed)

# A classifier yields one set of SHAP values per class; the beeswarm plot needs
# a 2-D explanation, so keep the values for class 1 only.
if len(shap_values.shape) == 3:
    shap_values = shap_values[:, :, 1]

# Visualize
shap.plots.beeswarm(shap_values)
