In [1]:
# Data Handling and Preprocessing
import pandas as pd            # DataFrame structure and data handling
import numpy as np             # Numerical operations

# Data Preprocessing
from sklearn.model_selection import train_test_split    # Train-test split
from sklearn.preprocessing import StandardScaler        # Feature scaling
from sklearn.preprocessing import OneHotEncoder         # Encoding categorical features
from imblearn.over_sampling import SMOTE                # Handling class imbalance

# Model Training
from sklearn.ensemble import RandomForestClassifier     # Random Forest
from sklearn.svm import SVC                             # Support Vector Machine
from xgboost import XGBClassifier                       # XGBoost model

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score   # Evaluation metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc           # Additional metrics and ROC

# Model Optimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV   # Hyperparameter tuning

# Visualization
import matplotlib.pyplot as plt     # Data visualization
import seaborn as sns               # Enhanced visualization

# Interpretability
import shap                          # SHAP for feature importance

# System
import warnings                      # To handle warnings
warnings.filterwarnings('ignore')


In [5]:
# Header names for the NSL-KDD files, listed in column order.
# There are 43 entries in total; the final two are 'label' and 'difficulty'.
column_names = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    # dst_host_* columns
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # trailing non-feature columns
    'label',
    'difficulty',
]


In [7]:
# Folder that holds the NSL-KDD text files. It is named once here so that
# pointing the notebook at a different copy of the dataset is a one-line edit
# instead of a change to every read_csv call.
# NOTE(review): this is an absolute Google Drive path as seen from Colab;
# presumably Drive has to be mounted before this cell runs — confirm.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Sarmad Project/Dataset'

# Both files are read as comma-separated text without a header row
# (header=None), so the column labels are supplied from column_names.

# Load the training data
train_data = pd.read_csv(f'{DATA_DIR}/KDDTrain+.txt', names=column_names, sep=",", header=None)

# Load the testing data
test_data = pd.read_csv(f'{DATA_DIR}/KDDTest+.txt', names=column_names, sep=",", header=None)


In [8]:
# Preview the leading rows of each split. sep="\n" puts the heading on its
# own line directly above the printed table.
print("Training Data:", train_data.head(), sep="\n")
print("Testing Data:", test_data.head(), sep="\n")


Training Data:
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0       0    0  ...                    0.17   
1               0       0    0  ...                    0.00   
2               0       0    0  ...                    0.10   
3               0       0    0  ...                    1.00   
4               0       0    0  ...                    1.00   

   dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0                    0.03                         0.17   
1                    0.60                         0.88   

In [9]:
# Report the (rows, columns) shape of each split.
# NOTE(review): the column labels were passed via `names=` when the files were
# read, so the column count presumably always equals len(column_names); a
# malformed file may not be detectable from this printout alone — confirm.
print(f"Training Data Shape: {train_data.shape}")
print(f"Testing Data Shape: {test_data.shape}")

Training Data Shape: (125973, 43)
Testing Data Shape: (22544, 43)


In [10]:
# Tally how many records carry each value of the 'label' column, first for
# the training split and then for the testing split.
print("Training Data Label Distribution:", train_data['label'].value_counts(), sep="\n")
print("Testing Data Label Distribution:", test_data['label'].value_counts(), sep="\n")

Training Data Label Distribution:
label
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64
Testing Data Label Distribution:
label
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgeta