In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def _load_raw(path):
    """Read one raw split; header=None means no line of the file is treated as a header."""
    return pd.read_csv(path, header=None)


train = _load_raw("../data/raw/KDDTrain+.txt")
test = _load_raw("../data/raw/KDDTest+.TXT")

In [3]:
# Names for the 43 positional columns of the raw files, in file order.
# The last two entries ('label', 'difficulty') are dropped again before modelling.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

# Both splits share the same schema, so they receive the same header.
train.columns, test.columns = columns, columns

In [4]:
# Two-class target: keep 'normal' as-is and collapse every other label into 'attack'.
_binary_names = {True: 'normal', False: 'attack'}

train['binary_label'] = train['label'].eq('normal').map(_binary_names)
test['binary_label'] = test['label'].eq('normal').map(_binary_names)

In [5]:
# Lookup from the fine-grained label to one of five coarse categories:
# 'normal', 'dos', 'probe', 'r2l', 'u2r'.
attack_map = {
    "normal": "normal",

    # DoS
    "back": "dos", "land": "dos", "neptune": "dos", "pod": "dos",
    "smurf": "dos", "teardrop": "dos", "mailbomb": "dos",
    "apache2": "dos", "processtable": "dos", "udpstorm": "dos",

    # Probe
    "satan": "probe", "ipsweep": "probe", "nmap": "probe",
    "portsweep": "probe", "mscan": "probe", "saint": "probe",

    # R2L
    # NOTE(review): some published taxonomies list "worm" under DoS rather
    # than R2L - confirm which grouping this project intends.
    "guess_passwd": "r2l", "ftp_write": "r2l", "imap": "r2l",
    "phf": "r2l", "multihop": "r2l", "warezmaster": "r2l",
    "warezclient": "r2l", "spy": "r2l", "xlock": "r2l",
    "xsnoop": "r2l", "snmpguess": "r2l", "snmpgetattack": "r2l",
    "httptunnel": "r2l", "sendmail": "r2l", "named": "r2l",
    "worm": "r2l",

    # U2R
    "buffer_overflow": "u2r", "loadmodule": "u2r",
    "rootkit": "u2r", "perl": "u2r",
    "sqlattack": "u2r", "xterm": "u2r", "ps": "u2r"
}

train['attack_category'] = train['label'].map(attack_map)
test['attack_category'] = test['label'].map(attack_map)

# Series.map with a dict yields NaN for any label that is not a key, which
# would silently put missing values into the multi-class target. Fail loudly
# and name the offending labels instead.
for _split_name, _frame in (("train", train), ("test", test)):
    _unmapped = _frame.loc[_frame['attack_category'].isna(), 'label'].unique()
    if len(_unmapped) > 0:
        raise ValueError(
            f"{_split_name}: labels missing from attack_map: "
            f"{sorted(map(str, _unmapped))}"
        )

In [6]:
# Everything except the raw label, the difficulty column and the two derived
# targets is used as a model input.
_non_feature_cols = ['label', 'difficulty', 'binary_label', 'attack_category']

X_train = train.drop(_non_feature_cols, axis=1)
X_test = test.drop(_non_feature_cols, axis=1)

# Binary target: 'normal' / 'attack'
y_train_bin = train['binary_label']
y_test_bin = test['binary_label']

# Multi-class target: 'normal', 'dos', 'probe', 'r2l', 'u2r'
y_train_multi = train['attack_category']
y_test_multi = test['attack_category']

In [7]:
from sklearn.preprocessing import OneHotEncoder
# The three string-valued columns are one-hot encoded; every other feature
# column is passed through unchanged, in its original order.
cat_features = ['protocol_type', 'service', 'flag']
num_features = X_train.columns[~X_train.columns.isin(cat_features)].tolist()

# Dense output so the result can be stacked with the numeric block.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the category vocabulary from the training split only, then encode both splits.
ohe.fit(X_train[cat_features])
X_train_cat = ohe.transform(X_train[cat_features])
X_test_cat = ohe.transform(X_test[cat_features])

X_train_num = X_train[num_features].to_numpy()
X_test_num = X_test[num_features].to_numpy()

In [8]:
import numpy as np

# Final design matrices: numeric columns first, then the one-hot columns,
# joined side by side (column-wise).
X_train_oh = np.concatenate((X_train_num, X_train_cat), axis=1)
X_test_oh = np.concatenate((X_test_num, X_test_cat), axis=1)

In [9]:
from sklearn.preprocessing import Normalizer

# Rescale every sample (row) to unit L2 norm.
# NOTE(review): Normalizer works per row, not per feature, and the rows here
# mix unscaled numeric columns with 0/1 one-hot columns - confirm that
# row-wise scaling (rather than per-feature scaling) is what is intended.
l2_norm = Normalizer(norm='l2')

X_train_l2 = l2_norm.fit(X_train_oh).transform(X_train_oh)
X_test_l2 = l2_norm.transform(X_test_oh)


In [10]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs("../data/processed", exist_ok=True)

# Persist the normalised matrices. The arrays carry no column names, so the
# CSV header is the positional index 0..n-1; index=False omits the row index.
pd.DataFrame(X_train_l2).to_csv("../data/processed/X_train_l2.csv", index=False)
pd.DataFrame(X_test_l2).to_csv("../data/processed/X_test_l2.csv", index=False)