In [9]:
import pandas as pd
import numpy as np

url_train = "https://raw.githubusercontent.com/Jehuty4949/NSL_KDD/master/KDDTrain%2B.txt"
url_test = "https://raw.githubusercontent.com/Jehuty4949/NSL_KDD/master/KDDTest%2B.txt"
columns_url = "https://raw.githubusercontent.com/Jehuty4949/NSL_KDD/master/Field%20Names.csv"

# Load column names: the first column of the field-names file holds the
# feature names; the data files carry two extra trailing columns that are
# named 'class' and 'type' here.
columns = pd.read_csv(columns_url, sep=',', header=None)
col_names = columns[0].tolist() + ['class', 'type']


def _load_split(url, column_names):
    """Read one headerless data file and return a cleaned DataFrame.

    The "type" column is dropped (not needed) and surrounding whitespace is
    stripped from the class labels. Nothing is mutated in place, so re-running
    this cell always rebuilds the frames from the downloaded data.
    """
    # NOTE(review): 'type' is only a local name for the last column of the
    # file — confirm what that column actually contains before relying on it.
    split = pd.read_csv(url, names=column_names).drop(columns=['type'])
    split['class'] = split['class'].str.strip()
    return split


# Load train and test data
train_df = _load_split(url_train, col_names)
test_df = _load_split(url_test, col_names)

# Combine train and test into one frame with a fresh 0..n-1 index
full_df = pd.concat([train_df, test_df], ignore_index=True)
print(f"✔️ Combined dataset shape: {full_df.shape}")
print(f"🧾 Unique class labels: {full_df['class'].unique()}")

✔️ Combined dataset shape: (148517, 42)
🧾 Unique class labels: ['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy' 'perl' 'saint' 'mscan' 'apache2' 'snmpgetattack'
 'processtable' 'httptunnel' 'ps' 'snmpguess' 'mailbomb' 'named'
 'sendmail' 'xterm' 'worm' 'xlock' 'xsnoop' 'sqlattack' 'udpstorm']


In [10]:

# For NSL_KDD Dataset
# Group the fine-grained attack labels into five coarse categories:
# normal, DoS, Probe, R2L and U2R.
# NOTE(review): some published NSL-KDD groupings place a few of these labels
# (e.g. snmpgetattack, snmpguess, httptunnel, named, worm) in a different
# category — confirm that the assignment below is the intended one.
category_map = {
    # Normal
    'normal': 'normal',

    # DoS
    'neptune': 'DoS', 'teardrop': 'DoS', 'smurf': 'DoS', 'pod': 'DoS', 'back': 'DoS', 'land': 'DoS',
    'apache2': 'DoS', 'mailbomb': 'DoS', 'processtable': 'DoS', 'udpstorm': 'DoS',

    # Probe
    'ipsweep': 'Probe', 'portsweep': 'Probe', 'nmap': 'Probe', 'satan': 'Probe', 'saint': 'Probe',
    'mscan': 'Probe', 'snmpgetattack': 'Probe', 'snmpguess': 'Probe',

    # R2L
    'guess_passwd': 'R2L', 'ftp_write': 'R2L', 'imap': 'R2L', 'multihop': 'R2L', 'warezclient': 'R2L',
    'warezmaster': 'R2L', 'phf': 'R2L', 'spy': 'R2L', 'xlock': 'R2L', 'xsnoop': 'R2L', 'sendmail': 'R2L',

    # U2R
    'rootkit': 'U2R', 'buffer_overflow': 'U2R', 'loadmodule': 'U2R', 'perl': 'U2R', 'httptunnel': 'U2R',
    'ps': 'U2R', 'sqlattack': 'U2R', 'xterm': 'U2R', 'worm': 'U2R', 'named': 'U2R'
}

# Apply mapping.
# A plain `.map(category_map)` turns every label that is not a key into NaN,
# so running this cell a second time (when the column already holds 'DoS',
# 'Probe', ...) would wipe out the target. Falling back to the current value
# keeps the cell idempotent.
full_df['class'] = full_df['class'].map(lambda label: category_map.get(label, label))

# Fail loudly if any label is neither a known attack name nor an
# already-grouped category, instead of letting it slip through unnoticed.
unexpected_labels = set(full_df['class'].dropna().unique()) - set(category_map.values())
if unexpected_labels:
    raise ValueError(
        f"Class labels missing from category_map: {sorted(map(str, unexpected_labels))}"
    )

# Check the result
print("Grouped classes:", full_df['class'].unique())


Grouped classes: ['normal' 'DoS' 'R2L' 'Probe' 'U2R']


In [11]:
# Preview the first 10 rows of the combined frame, including the 'class' column.
full_df.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,DoS
6,0,tcp,private,S0,0,0,0,0,0,0,...,9,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS
7,0,tcp,private,S0,0,0,0,0,0,0,...,15,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0,DoS
8,0,tcp,remote_job,S0,0,0,0,0,0,0,...,23,0.09,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS
9,0,tcp,private,S0,0,0,0,0,0,0,...,13,0.05,0.06,0.0,0.0,1.0,1.0,0.0,0.0,DoS


In [12]:
# Separate the model inputs from the label column.
# `features` is every column except 'class'; `target` is the 'class' column.
features = full_df.drop('class', axis=1)
target = full_df['class']

# Show the first rows of the feature frame.
features.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
1,0,udp,other,SF,146,0,0,0,0,0,...,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0
2,0,tcp,private,S0,0,0,0,0,0,0,...,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0
3,0,tcp,http,SF,232,8153,0,0,0,0,...,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from preprocess import Preprocess

# Run the project's preprocessing helper over the feature frame, asking for
# one-hot output for categorical columns.
# NOTE(review): the recorded log reports exactly 64 levels for `service`,
# which equals the n_categorical_levels cap passed here — confirm that no
# service values are being truncated or merged by this limit.
pp = Preprocess()
processed_features = pp.fit_transform_df_auto(
    df=features,
    n_categorical_levels=64,
    expected_categorical_format='onehot',
)

print(processed_features.shape)
processed_features.head()


Encoding the 3 levels for protocol_type
Encoding the 64 levels for service
Encoding the 11 levels for flag
(148517, 116)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_2,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11
0,0.0,0.29453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,0.0,0.237128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
3,0.0,0.259014,0.429008,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,0.0,0.251757,0.287837,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Encode the grouped class labels with the same Preprocess instance used for
# the features. In the recorded output the result is a one-column frame named
# 'class' holding integer codes (rows labelled 'normal' show 4, 'DoS' shows 0).
# NOTE(review): the full label-to-code mapping lives inside `pp` — keep it
# around if predictions have to be translated back to category names.
processed_target = pp.fit_target('class', target.values)
processed_target.head()

Unnamed: 0,class
0,4
1,4
2,0
3,4
4,4


In [15]:
# Join the processed features and the encoded target side by side.
# pd.concat(axis=1) aligns rows on index labels; if the two frames did not
# share the same index the result would silently gain NaN-padded rows, so
# check the alignment explicitly before concatenating.
if not processed_features.index.equals(processed_target.index):
    raise ValueError(
        "processed_features and processed_target have different indexes; "
        "refusing to concatenate misaligned rows"
    )

processed_full_df = pd.concat([processed_features, processed_target], axis=1)
processed_full_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11,class
0,0.0,0.29453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,4
1,0.0,0.237128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,0
3,0.0,0.259014,0.429008,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,4
4,0.0,0.251757,0.287837,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,4


In [None]:
from featureSelection import PSOFeatureSelector

# Run the project's PSO-based feature selector on the processed frame, with
# 'class' as the target column. The call returns two values, unpacked below.
# NOTE(review): no random seed is set anywhere in this notebook — if the
# selector is stochastic (presumably, given the name), results will differ
# between runs; confirm how PSOFeatureSelector can be seeded.
# NOTE(review): this cell has no recorded execution count or output — it may
# be long-running on the full 148517-row frame; consider caching its result.
pso = PSOFeatureSelector()
selected_mask, selected_feature = pso.run_pso(df=processed_full_df, target_column='class', verbose=True)

# Display the second returned value (presumably the selected feature names — TODO confirm).
selected_feature