In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [None]:
### Preprocessing
# Column names for the KDDTrain+/KDDTest+ files, in file order
# (the files are read with header=None, so names are supplied here).
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty_level",
]

# Load each split and drop the "difficulty_level" column right away.
df_train = pd.read_csv(
    'https://raw.githubusercontent.com/Tdjaaleb/IDS_AdversarialML/main/Data/KDDTrain%2B.txt',
    header=None,
    names=col_names,
).drop(columns=['difficulty_level'])

df_test = pd.read_csv(
    'https://raw.githubusercontent.com/Tdjaaleb/IDS_AdversarialML/main/Data/KDDTest%2B.txt',
    header=None,
    names=col_names,
).drop(columns=['difficulty_level'])

# Recode the "label" column: individual attack names -> attack family
def change_label(df):
  """Replace individual attack names in ``df['label']`` by their family name.

  The four families are 'Dos', 'R2L', 'Probe' and 'U2R'. Values that are not
  listed below (e.g. 'normal') are left unchanged.

  The frame is modified in place and nothing is returned. The recoded column
  is assigned back to ``df`` instead of calling ``replace(inplace=True)`` on
  the column accessor: that chained in-place form does not update the frame
  when pandas Copy-on-Write is active.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a ``label`` column holding attack names.
  """
  attack_families = {
      'Dos': ['apache2','back','land','neptune','mailbomb','pod','processtable',
              'smurf','teardrop','udpstorm','worm'],
      'R2L': ['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf',
              'sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster',
              'xlock','xsnoop'],
      'Probe': ['ipsweep','mscan','nmap','portsweep','saint','satan'],
      'U2R': ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],
  }
  # Flatten to {attack_name: family} so a single replace call does the recoding.
  attack_to_family = {
      attack: family
      for family, attacks in attack_families.items()
      for attack in attacks
  }
  df['label'] = df['label'].replace(attack_to_family)

# Recode the labels of both splits (change_label works in place).
for labelled_frame in (df_train, df_test):
  change_label(labelled_frame)

In [None]:
### Normalization
numeric_col_train = df_train.select_dtypes(include='number').columns
numeric_col_test = df_test.select_dtypes(include='number').columns

std_scaler = StandardScaler()
def normalization(df, col, fit=True):
  """Standardise the columns ``col`` of ``df`` with the shared ``std_scaler``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to scale; the listed columns are overwritten in this frame.
  col : iterable of column names
      Columns to standardise.
  fit : bool, default True
      When True the scaler statistics are (re)estimated from ``df`` before
      scaling. When False the statistics learned by the last fitting call are
      reused, so ``col`` must match the columns of that call.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, with the listed columns standardised.
  """
  cols = list(col)
  if not cols:
    # Nothing to scale; the scaler cannot be fitted on zero features.
    return df
  values = df[cols].to_numpy(dtype=float)
  # StandardScaler works column-wise, so one call handles every column and
  # keeps the per-column statistics available for later transform-only calls.
  scaled = std_scaler.fit_transform(values) if fit else std_scaler.transform(values)
  df[cols] = scaled
  return df

# Fit on the training split only, then apply the same statistics to the test
# split so both splits share one feature scale and no test information is
# used to estimate the scaling parameters.
df_train = normalization(df_train.copy(),numeric_col_train)
df_test = normalization(df_test.copy(),numeric_col_train,fit=False)

In [None]:
### One Hot Encoding
cat_col = ['protocol_type','service','flag']

categorical_train = df_train[cat_col]
categorical_test = df_test[cat_col]

categorical_train = pd.get_dummies(categorical_train,columns=cat_col)
categorical_test = pd.get_dummies(categorical_test,columns=cat_col)

# get_dummies only creates columns for the categories present in the frame it
# receives, so the two splits can end up with different dummy columns.
# Align the test frame on the training columns: categories missing from the
# test split become all-zero columns, and categories that appear only in the
# test split are dropped. The cast keeps the dummy dtype identical to train.
categorical_test = (
    categorical_test
    .reindex(columns=categorical_train.columns, fill_value=0)
    .astype(categorical_train.dtypes.to_dict())
)

In [None]:
# Multiclass Classification
multi_df_train = df_train.copy()
multi_df_test = df_test.copy()

# Keep the string labels aside; they are re-attached after get_dummies
# consumes the 'label' column.
multi_label_train = pd.DataFrame(multi_df_train.label)
multi_label_test = pd.DataFrame(multi_df_test.label)

# Fit the encoder on the training labels only and reuse it for the test labels,
# so a given class always maps to the same integer code in both splits.
# transform() raises ValueError if the test split contains a class that was
# not seen during fitting, instead of silently producing shifted codes.
le2_train = preprocessing.LabelEncoder()
le2_test = le2_train  # alias kept so code referring to le2_test still works

enc_label_train = multi_label_train.apply(le2_train.fit_transform)
enc_label_test = multi_label_test.apply(le2_test.transform)

multi_df_train['intrusion'] = enc_label_train
multi_df_test['intrusion'] = enc_label_test

# One indicator column per class, named exactly after the class (no prefix).
multi_df_train = pd.get_dummies(multi_df_train,columns=['label'],prefix="",prefix_sep="")
multi_df_test = pd.get_dummies(multi_df_test,columns=['label'],prefix="",prefix_sep="")

multi_df_train['label'] = multi_label_train
multi_df_test['label'] = multi_label_test


def _plot_label_pie(labels, title):
    """Draw a pie chart of the class frequencies found in ``labels``.

    Parameters
    ----------
    labels : pandas.Series
        Class label of every row.
    title : str
        Title of the figure.
    """
    counts = labels.value_counts()
    plt.figure(figsize=(8,8))
    # Wedge labels come from the index of the counts themselves. Using
    # labels.unique() would pair frequency-ordered wedges with
    # appearance-ordered names and mislabel the chart.
    plt.pie(counts,labels=counts.index,autopct='%0.2f%%')
    plt.title(title)
    plt.legend()
    plt.show()

_plot_label_pie(multi_df_train.label,'Pie chart distribution of multi-class labels (Training)')
_plot_label_pie(multi_df_test.label,'Pie chart distribution of multi-class labels (Testing)')

In [None]:
# Feature Extraction
# Explicit copies: a column is added right below, and assigning into a frame
# obtained by column selection triggers SettingWithCopyWarning otherwise.
numeric_multi_train = multi_df_train[numeric_col_train].copy()
numeric_multi_test = multi_df_test[numeric_col_test].copy()

numeric_multi_train['intrusion'] = multi_df_train['intrusion']
numeric_multi_test['intrusion'] = multi_df_test['intrusion']

# Absolute correlation of every numeric feature with the encoded class,
# computed on the training split only.
corr = numeric_multi_train.corr()
corr_y = abs(corr['intrusion'])
highest_corr = corr_y[corr_y >0.5]
# sort_values returns a new Series; keep it under its own name so the sorted
# view is available for inspection (highest_corr itself is left untouched).
highest_corr_sorted = highest_corr.sort_values(ascending=True)

# Hand-picked feature list, presumably taken from highest_corr above
# (NOTE(review): confirm it still matches when the data changes).
selected_features = ['count','logged_in','srv_serror_rate','serror_rate','dst_host_serror_rate',
                     'dst_host_same_srv_rate','dst_host_srv_serror_rate','dst_host_srv_count','same_srv_rate']
# Encoded class, one indicator column per class, and the string label.
target_columns = ['intrusion','Dos','Probe','R2L','U2R','normal','label']

numeric_multi_train = multi_df_train[selected_features]
numeric_multi_test = multi_df_test[selected_features]

# Append the one-hot encoded categorical features (joined on the row index).
numeric_multi_train = numeric_multi_train.join(categorical_train)
numeric_multi_test = numeric_multi_test.join(categorical_test)

# NOTE: multi_df_train / multi_df_test are overwritten here, so this cell
# cannot be re-run without first re-running the multiclass cell.
multi_df_train = numeric_multi_train.join(multi_df_train[target_columns])
multi_df_test = numeric_multi_test.join(multi_df_test[target_columns])

In [27]:
number_of_chunks = 2

# to_csv does not create missing folders; make sure the target folder exists.
Path('Data').mkdir(parents=True, exist_ok=True)

# Split the row positions rather than the DataFrame itself: np.array_split on
# a DataFrame goes through the deprecated DataFrame.swapaxes. Splitting the
# positions yields the same near-equal chunks (earlier chunks take the extra
# rows when the length is not divisible by number_of_chunks).
chunk_positions = np.array_split(np.arange(len(multi_df_train)), number_of_chunks)

for chunk_id, positions in enumerate(chunk_positions):
    multi_df_train.iloc[positions].to_csv(f'Data/multi_data_train_{chunk_id}.csv', index=False)

multi_df_test.to_csv("Data/multi_data_test.csv", index=False)