To learn more about the dataset, see:
*https://github.com/HoaNP/NSL-KDD-DataSet*

#### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Initial raw read: header=None makes pandas treat the first line as data and
# assign integer column labels.
# NOTE(review): both frames are read again with explicit column names in the
# "Common Pre-Processing" cell, which overwrites these.
train, test = (
    pd.read_csv(path, header=None)
    for path in ("../data/KDDTrain.txt", "../data/KDDTest.txt")
)

#### Common Pre-Processing

In [None]:
# Column names for the 41 feature columns, in file order.
feature_names = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

# Full column list: the features followed by the two trailing columns.
cols = [*feature_names, 'label', 'difficulty']

# Read both splits, labelling the columns with the names collected in `cols`.
train, test = (
    pd.read_csv(path, names=cols)
    for path in ('../data/KDDTrain.txt', '../data/KDDTest.txt')
)

In [None]:
# Remove the 'difficulty' column from both frames.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# rather than a KeyError.
train = train.drop(columns='difficulty', errors='ignore')
test = test.drop(columns='difficulty', errors='ignore')

#### Training Data Pre-Processing

In [None]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train.head()

In [None]:
# Print a summary of the training frame: columns, non-null counts and dtypes.
train.info()

In [None]:
# Missing values per column; if every count is 0, no imputation is needed.
train.isna().sum()

In [None]:
# Number of training rows per protocol_type; value_counts() returns the counts
# ordered from most to least frequent, indexed by the protocol name.
vals = train['protocol_type'].value_counts()
vals  # last expression, so it is rendered as the cell output

In [None]:
# Take both the bar labels and the bar heights from the same value_counts()
# result so each bar is paired with its own count. unique() returns values in
# order of first appearance, which need not match the descending-count order of
# value_counts(), so mixing the two can mislabel the bars.
train_protocol_counts = train['protocol_type'].value_counts()
cat = train_protocol_counts.index.to_numpy()  # protocol names, aligned with the counts
plt.bar(cat, train_protocol_counts.to_numpy(), color='salmon')
plt.title('Protocol_Type Distribution in NSL-KDD Train Set')
plt.xlabel('protocol_type')
plt.ylabel('Count')
plt.show()

In [None]:
# Collapse every non-'normal' label into a single 'attack' class (vectorised;
# keeps 'normal' where the condition holds and substitutes 'attack' elsewhere).
train_binary_label = train['label'].where(train['label'] == 'normal', 'attack')

# Pin the bar order so 'normal' is always skyblue and 'attack' always salmon.
# value_counts() alone orders by frequency, so the colours would otherwise
# follow whichever class happens to be the majority.
train_binary_counts = (
    train_binary_label
    .value_counts()
    .reindex(['normal', 'attack'], fill_value=0)
)
train_binary_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Label Distribution in NSL-KDD Train Set')
plt.xlabel('label')
plt.ylabel('Count')
plt.show()


Since the target (normal vs. attack) is well balanced, the feature-level imbalance in `protocol_type` (tcp > udp > icmp) is not critical to fix.

Why:

- Random Forests handle categorical skew reasonably well.

- What matters most is the label balance, which is good in this dataset.

- You only need to be aware that tcp dominates; the model might rely slightly more on it, but for now it’s fine.

#### Test Data Pre-Processing

In [None]:
# Preview the first rows of the test frame (head() defaults to 5 rows).
test.head()

In [None]:
# Print a summary of the test frame: columns, non-null counts and dtypes.
test.info()

In [None]:
# Missing values per column of the test frame.
test.isna().sum()

In [None]:
# Count protocol types on the TEST frame itself. The heights previously came
# from `vals`, which holds the training-set counts, so the chart showed the
# wrong data (and would fail if the two splits had a different number of
# protocol types). Labels and heights now come from the same value_counts().
test_protocol_counts = test['protocol_type'].value_counts()
cat = test_protocol_counts.index.to_numpy()  # protocol names, aligned with the counts
plt.bar(cat, test_protocol_counts.to_numpy(), color='salmon')
plt.title('Protocol_Type Distribution in NSL-KDD Test Set')
plt.xlabel('protocol_type')
plt.ylabel('Count')
plt.show()

In [None]:
# Collapse every non-'normal' label into a single 'attack' class (vectorised;
# keeps 'normal' where the condition holds and substitutes 'attack' elsewhere).
test_binary_label = test['label'].where(test['label'] == 'normal', 'attack')

# Pin the bar order so 'normal' is always skyblue and 'attack' always salmon.
# value_counts() alone orders by frequency, so the colours would otherwise
# follow whichever class happens to be the majority.
test_binary_counts = (
    test_binary_label
    .value_counts()
    .reindex(['normal', 'attack'], fill_value=0)
)
test_binary_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Label Distribution in NSL-KDD Test Set')
plt.xlabel('label')
plt.ylabel('Count')
plt.show()


In [None]:
# Binary class counts for the test frame: anything that is not 'normal' is
# counted as 'attack'.
is_normal = test['label'] == 'normal'
test['label'].where(is_normal, 'attack').value_counts()