In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [68]:
# Locations of the two raw NSL-KDD splits.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run as-is on a machine with this exact directory layout.
train_path = "C:/Users/Shantanu Ojha/Desktop/nsl-kdd-classical-ids-reproduction/data/raw/KDDTrain+.txt"
test_path = "C:/Users/Shantanu Ojha/Desktop/nsl-kdd-classical-ids-reproduction/data/raw/KDDTest+.TXT"

# header=None: the first line of each file is read as a data row, not as
# column names (names are assigned in a later cell).
train = pd.read_csv(train_path, header=None)
test = pd.read_csv(test_path, header=None)

In [69]:
# Header for the header-less raw frames: 43 names in file order, i.e.
# 41 feature columns followed by 'label' and 'difficulty'.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate',
    # dst_host_* columns
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    # target and bookkeeping columns
    'label', 'difficulty',
]

# Both splits share the same schema, so they get the same header.
train.columns = test.columns = columns

In [70]:
# Binary target: keep 'normal' as-is and collapse every other label into 'attack'.
train['binary_label'] = np.where(train['label'] == 'normal', 'normal', 'attack')
test['binary_label'] = np.where(test['label'] == 'normal', 'normal', 'attack')


In [71]:
# Raw label -> coarse class (normal, DoS, Probe, R2L, U2R).
attack_map = {
    'normal': 'normal',

    # DoS
    'back': 'DoS', 'land': 'DoS', 'neptune': 'DoS', 'pod': 'DoS',
    'smurf': 'DoS', 'teardrop': 'DoS', 'mailbomb': 'DoS',
    'apache2': 'DoS', 'processtable': 'DoS', 'udpstorm': 'DoS',
    # NOTE(review): 'worm' appears in KDDTest+ and was missing from this map,
    # which left NaN categories in the test split. It is commonly grouped
    # with DoS in the NSL-KDD literature -- confirm against the taxonomy
    # being reproduced.
    'worm': 'DoS',

    # Probe
    'satan': 'Probe', 'ipsweep': 'Probe', 'nmap': 'Probe',
    'portsweep': 'Probe', 'mscan': 'Probe', 'saint': 'Probe',

    # R2L
    'guess_passwd': 'R2L', 'ftp_write': 'R2L', 'imap': 'R2L', 'phf': 'R2L',
    'multihop': 'R2L', 'warezmaster': 'R2L', 'warezclient': 'R2L',
    'spy': 'R2L', 'xlock': 'R2L', 'xsnoop': 'R2L', 'snmpguess': 'R2L',
    'snmpgetattack': 'R2L', 'httptunnel': 'R2L', 'sendmail': 'R2L', 'named': 'R2L',

    # U2R
    'buffer_overflow': 'U2R', 'loadmodule': 'U2R', 'rootkit': 'U2R',
    'perl': 'U2R', 'sqlattack': 'U2R', 'xterm': 'U2R', 'ps': 'U2R'
}


def _map_attack_category(labels, split_name):
    """Translate raw labels into their coarse class using ``attack_map``.

    Parameters
    ----------
    labels : pandas.Series
        Raw label column of one split.
    split_name : str
        Name of the split, used only in the error message.

    Returns
    -------
    pandas.Series
        The class of every row, aligned with ``labels``.

    Raises
    ------
    ValueError
        If at least one label has no entry in ``attack_map``.
    """
    categories = labels.map(attack_map)
    # Series.map yields NaN for keys missing from the dict; fail loudly here
    # rather than letting NaN targets flow into the exported CSVs.
    missing = list(labels[categories.isna()].unique())
    if missing:
        raise ValueError(
            f"Labels without an attack_map entry in the {split_name} split: {missing}"
        )
    return categories


train['attack_category'] = _map_attack_category(train['label'], 'train')
test['attack_category'] = _map_attack_category(test['label'], 'test')

In [72]:
# Targets first: the two-class and the five-class view of the same rows.
y_train_bin, y_test_bin = train['binary_label'], test['binary_label']              # normal / attack
y_train_multi, y_test_multi = train['attack_category'], test['attack_category']    # normal, DoS, Probe, R2L, U2R

# Everything that is a label, derived from the label, or the 'difficulty'
# column is removed so that only feature columns remain.
non_feature_cols = ['label', 'difficulty', 'binary_label', 'attack_category']
X_train = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

In [73]:
from sklearn.preprocessing import OneHotEncoder

# The three string-valued columns are one-hot encoded; every other column is
# treated as numeric.
cat_features = ['protocol_type', 'service', 'flag']
num_features = X_train.columns[~X_train.columns.isin(cat_features)].tolist()

# Numeric part as plain arrays (scaled in a later cell).
X_train_num = X_train[num_features].to_numpy()
X_test_num = X_test[num_features].to_numpy()

# The encoder learns its categories from the training split only.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros;
# sparse_output=False returns a dense array.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[cat_features])

X_train_cat = ohe.transform(X_train[cat_features])
X_test_cat = ohe.transform(X_test[cat_features])


In [74]:
from sklearn.preprocessing import StandardScaler

# Scaling statistics are estimated on the training split only, then the same
# fitted transform is applied to both splits.
scaler = StandardScaler().fit(X_train_num)

X_train_num = scaler.transform(X_train_num)
X_test_num = scaler.transform(X_test_num)


In [75]:
# Sanity check: (rows, columns) of the numeric and the one-hot training blocks, one per line.
print(X_train_num.shape, X_train_cat.shape, sep="\n")


(125973, 38)
(125973, 84)


In [76]:
# Make sure all four feature blocks are plain ndarrays before they are stacked.
X_train_num, X_train_cat = np.asarray(X_train_num), np.asarray(X_train_cat)
X_test_num, X_test_cat = np.asarray(X_test_num), np.asarray(X_test_cat)


In [77]:
# Both blocks must be 2-D with the same number of rows for the column-wise stack that follows.
print(f"Train num shape: {X_train_num.shape} ndim: {X_train_num.ndim}")
print(f"Train cat shape: {X_train_cat.shape} ndim: {X_train_cat.ndim}")


Train num shape: (125973, 38) ndim: 2
Train cat shape: (125973, 84) ndim: 2


In [78]:
# Final design matrices: scaled numeric columns first, one-hot columns after,
# joined along the column axis.
X_train_final = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test_final = np.concatenate([X_test_num, X_test_cat], axis=1)

In [79]:
from pathlib import Path

# NOTE(review): absolute, machine-specific output location kept unchanged so
# existing consumers still find the files; consider a configurable DATA_DIR.
preprocessed_dir = Path("C:/Users/Shantanu Ojha/Desktop/nsl-kdd-classical-ids-reproduction/data/preprocessed")


def _save_preprocessed(out_dir, named_outputs):
    """Write every object in ``named_outputs`` to ``out_dir`` as CSV.

    Parameters
    ----------
    out_dir : pathlib.Path
        Destination directory; created (with parents) if it does not exist.
    named_outputs : dict
        Maps a file name to the DataFrame or Series to store under that name.

    Returns
    -------
    None
    """
    # Without this, to_csv fails on a fresh checkout where the
    # directory has not been created yet.
    out_dir.mkdir(parents=True, exist_ok=True)
    for file_name, payload in named_outputs.items():
        # index=False: only the values (and the header row) are written.
        payload.to_csv(out_dir / file_name, index=False)


_save_preprocessed(
    preprocessed_dir,
    {
        "X_train.csv": pd.DataFrame(X_train_final),
        "X_test.csv": pd.DataFrame(X_test_final),
        "y_train_binary.csv": y_train_bin,
        "y_test_binary.csv": y_test_bin,
        "y_train_multiclass.csv": y_train_multi,
        "y_test_multiclass.csv": y_test_multi,
    },
)

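Categorical features (`protocol_type`, `service`, `flag`) were one-hot encoded with an encoder fitted on the training set only; categories that appear only in the test set are encoded as all zeros.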

Numeric features were standardized using statistics from the training set only to avoid data leakage.

Both binary (normal / attack) and multiclass (normal, DoS, Probe, R2L, U2R) labels were prepared.