In [2]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from sklearn.model_selection import train_test_split 

warnings.filterwarnings("ignore")

In [3]:
# Load the raw CIC-IDS2017 "MachineLearningCVE" CSV with pandas' default parsing options.
raw_csv_path = "/Users/Safa BENABDESSADOK/Desktop/M1 GL/SAD/SAD project/Dataset/CIC-IDS2017/MachineLearningCVE.csv"
data = pd.read_csv(raw_csv_path)

In [4]:
# Strip leading/trailing whitespace from every header so later cells can
# address columns by their clean names (e.g. 'Label').
data.columns = data.columns.str.strip()

### Remove duplicates, missing values, and uninformative features

In [5]:
# Drop rows with missing values first, then exact duplicate rows.
# Chained (non-inplace) calls keep the lineage explicit and avoid mutating
# the loaded frame object behind the reader's back.
data = data.dropna().drop_duplicates()

# Duplicated feature (presumably the second copy of 'Fwd Header Length';
# pandas appends '.1' to a repeated header name when reading a CSV).
duplicated_features = ['Fwd Header Length.1']

# Features treated as non-effective for the analysis.
non_effective_features = [
    'Bwd PSH Flags', 'Bwd URG Flags',
    'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
]

# Features treated as highly correlated with other retained features.
highly_correlated_features = [
    'Bwd IAT Std', 'SYN Flag Count', 'Fwd Packet Length Max', 'Fwd Packet Length Std',
    'Subflow Bwd Packets', 'Subflow Fwd Packets', 'Total Backward Packets', 'Total Fwd Packets',
    'Total Length of Bwd Packets', 'act_data_pkt_fwd', 'RST Flag Count', 'Fwd URG Flags', 'Idle Max',
]

# A missing column name still raises KeyError, exactly as the separate drops did.
data = data.drop(columns=duplicated_features + non_effective_features + highly_correlated_features)

# Keep only rows whose feature values (every column except the trailing label)
# are all finite, i.e. no +/-inf or values too large for float64.
finite_rows = np.isfinite(data[data.columns[0:-1]]).all(axis=1)
# .copy() makes the result an independent frame: later cells assign columns
# into `data`, which on a boolean-filtered selection would otherwise trigger
# pandas' SettingWithCopy handling.
data = data.loc[finite_rows].copy()

# the new shape
data.shape

(2520798, 57)

In [6]:
# Number of rows per traffic label in the cleaned data, most frequent first.
label_counts = data.loc[:, 'Label'].value_counts()
label_counts

BENIGN                        2095057
DoS Hulk                       172846
DDoS                           128014
PortScan                        90694
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1948
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: Label, dtype: int64

### Standard scaling (z-Score Scaling)

In [7]:
# Z-score every feature column (all columns except the trailing label column).
# NOTE(review): the mean/std are computed over the whole dataset, i.e. before
# the train/test split performed in the following cells, so held-out rows
# influence the scaling — confirm this is intended.
# NOTE(review): a zero-variance column would be turned into NaN by the
# division by its standard deviation — TODO confirm none remain after cleaning.
cols = list(data.columns[:-1])

# Build the scaled frame in one step instead of assigning column-by-column
# into a selection of `data` (which is chained assignment and only "worked"
# silently because warnings are globally suppressed). Each column is still
# scaled independently with scipy's defaults, so the values are unchanged.
# np.asarray + index=data.index keeps the assignment positional per row.
X = pd.DataFrame(
    {col: np.asarray(stats.zscore(data[col])) for col in cols},
    index=data.index,
)
data[cols] = X

**Saving prepared data in a CSV file**

In [8]:
# Persist the cleaned and scaled frame (features + label) without the row index.
prepared_csv_path = "/Users/Safa BENABDESSADOK/Desktop/M1 GL/SAD/SAD project/Dataset/CIC-IDS2017/preparedData.csv"
data.to_csv(prepared_csv_path, index=False)

### Split the dataset into training and testing data

In [9]:
# 'BENIGN' rows: separate features from the label, then hold out 20% with a fixed seed.
class0 = data.loc[data['Label'] == 'BENIGN']
class0_X = class0.iloc[:, :-1]  # every column except the last
class0_Y = class0.iloc[:, -1]   # last column (the label)
(
    class0_X_train,
    class0_X_test,
    class0_Y_train,
    class0_Y_test,
) = train_test_split(class0_X, class0_Y, test_size=0.2, random_state=10)

In [10]:
# 'Bot' rows: separate features from the label, then hold out 20% with a fixed seed.
class1 = data.loc[data['Label'] == 'Bot']
class1_X = class1.iloc[:, :-1]  # every column except the last
class1_Y = class1.iloc[:, -1]   # last column (the label)
(
    class1_X_train,
    class1_X_test,
    class1_Y_train,
    class1_Y_test,
) = train_test_split(class1_X, class1_Y, test_size=0.2, random_state=10)

In [11]:
# 'DDoS' rows: separate features from the label, then hold out 20% with a fixed seed.
class2 = data.loc[data['Label'] == 'DDoS']
class2_X = class2.iloc[:, :-1]  # every column except the last
class2_Y = class2.iloc[:, -1]   # last column (the label)
(
    class2_X_train,
    class2_X_test,
    class2_Y_train,
    class2_Y_test,
) = train_test_split(class2_X, class2_Y, test_size=0.2, random_state=10)

In [12]:
# 'DoS GoldenEye' rows: separate features from the label, then hold out 20% with a fixed seed.
class3 = data.loc[data['Label'] == 'DoS GoldenEye']
class3_X = class3.iloc[:, :-1]  # every column except the last
class3_Y = class3.iloc[:, -1]   # last column (the label)
(
    class3_X_train,
    class3_X_test,
    class3_Y_train,
    class3_Y_test,
) = train_test_split(class3_X, class3_Y, test_size=0.2, random_state=10)

In [13]:
# 'DoS Hulk' rows: separate features from the label, then hold out 20% with a fixed seed.
class4 = data.loc[data['Label'] == 'DoS Hulk']
class4_X = class4.iloc[:, :-1]  # every column except the last
class4_Y = class4.iloc[:, -1]   # last column (the label)
(
    class4_X_train,
    class4_X_test,
    class4_Y_train,
    class4_Y_test,
) = train_test_split(class4_X, class4_Y, test_size=0.2, random_state=10)

In [14]:
# 'DoS slowloris' rows: separate features from the label, then hold out 20% with a fixed seed.
class5 = data.loc[data['Label'] == 'DoS slowloris']
class5_X = class5.iloc[:, :-1]  # every column except the last
class5_Y = class5.iloc[:, -1]   # last column (the label)
(
    class5_X_train,
    class5_X_test,
    class5_Y_train,
    class5_Y_test,
) = train_test_split(class5_X, class5_Y, test_size=0.2, random_state=10)

In [15]:
# 'DoS Slowhttptest' rows: separate features from the label, then hold out 20% with a fixed seed.
class6 = data.loc[data['Label'] == 'DoS Slowhttptest']
class6_X = class6.iloc[:, :-1]  # every column except the last
class6_Y = class6.iloc[:, -1]   # last column (the label)
(
    class6_X_train,
    class6_X_test,
    class6_Y_train,
    class6_Y_test,
) = train_test_split(class6_X, class6_Y, test_size=0.2, random_state=10)

In [16]:
# 'FTP-Patator' rows: separate features from the label, then hold out 20% with a fixed seed.
class7 = data.loc[data['Label'] == 'FTP-Patator']
class7_X = class7.iloc[:, :-1]  # every column except the last
class7_Y = class7.iloc[:, -1]   # last column (the label)
(
    class7_X_train,
    class7_X_test,
    class7_Y_train,
    class7_Y_test,
) = train_test_split(class7_X, class7_Y, test_size=0.2, random_state=10)

In [17]:
# 'Heartbleed' rows: separate features from the label, then hold out 20% with a fixed seed.
class8 = data.loc[data['Label'] == 'Heartbleed']
class8_X = class8.iloc[:, :-1]  # every column except the last
class8_Y = class8.iloc[:, -1]   # last column (the label)
(
    class8_X_train,
    class8_X_test,
    class8_Y_train,
    class8_Y_test,
) = train_test_split(class8_X, class8_Y, test_size=0.2, random_state=10)

In [18]:
# 'Infiltration' rows: separate features from the label, then hold out 20% with a fixed seed.
class9 = data.loc[data['Label'] == 'Infiltration']
class9_X = class9.iloc[:, :-1]  # every column except the last
class9_Y = class9.iloc[:, -1]   # last column (the label)
(
    class9_X_train,
    class9_X_test,
    class9_Y_train,
    class9_Y_test,
) = train_test_split(class9_X, class9_Y, test_size=0.2, random_state=10)

In [19]:
# 'PortScan' rows: separate features from the label, then hold out 20% with a fixed seed.
class10 = data.loc[data['Label'] == 'PortScan']
class10_X = class10.iloc[:, :-1]  # every column except the last
class10_Y = class10.iloc[:, -1]   # last column (the label)
(
    class10_X_train,
    class10_X_test,
    class10_Y_train,
    class10_Y_test,
) = train_test_split(class10_X, class10_Y, test_size=0.2, random_state=10)

In [20]:
# 'SSH-Patator' rows: separate features from the label, then hold out 20% with a fixed seed.
class11 = data.loc[data['Label'] == 'SSH-Patator']
class11_X = class11.iloc[:, :-1]  # every column except the last
class11_Y = class11.iloc[:, -1]   # last column (the label)
(
    class11_X_train,
    class11_X_test,
    class11_Y_train,
    class11_Y_test,
) = train_test_split(class11_X, class11_Y, test_size=0.2, random_state=10)

In [21]:
# Web-attack brute-force rows: separate features from the label, then hold out 20% with a fixed seed.
# The label literal must match the dataset's value character-for-character.
class12 = data.loc[data['Label'] == 'Web Attack � Brute Force']
class12_X = class12.iloc[:, :-1]  # every column except the last
class12_Y = class12.iloc[:, -1]   # last column (the label)
(
    class12_X_train,
    class12_X_test,
    class12_Y_train,
    class12_Y_test,
) = train_test_split(class12_X, class12_Y, test_size=0.2, random_state=10)

In [22]:
# Web-attack SQL-injection rows: separate features from the label, then hold out 20% with a fixed seed.
# The label literal must match the dataset's value character-for-character.
class13 = data.loc[data['Label'] == 'Web Attack � Sql Injection']
class13_X = class13.iloc[:, :-1]  # every column except the last
class13_Y = class13.iloc[:, -1]   # last column (the label)
(
    class13_X_train,
    class13_X_test,
    class13_Y_train,
    class13_Y_test,
) = train_test_split(class13_X, class13_Y, test_size=0.2, random_state=10)

In [23]:
# Web-attack XSS rows: separate features from the label, then hold out 20% with a fixed seed.
# The label literal must match the dataset's value character-for-character.
class14 = data.loc[data['Label'] == 'Web Attack � XSS']
class14_X = class14.iloc[:, :-1]  # every column except the last
class14_Y = class14.iloc[:, -1]   # last column (the label)
(
    class14_X_train,
    class14_X_test,
    class14_Y_train,
    class14_Y_test,
) = train_test_split(class14_X, class14_Y, test_size=0.2, random_state=10)

In [24]:
# Stack the per-class splits into the final train/test sets.
# Parts are concatenated in class order (class0 ... class14) and keep the
# row index they had in `data`; no shuffling happens here.

# training data
train_feature_parts = [
    class0_X_train, class1_X_train, class2_X_train, class3_X_train, class4_X_train,
    class5_X_train, class6_X_train, class7_X_train, class8_X_train, class9_X_train,
    class10_X_train, class11_X_train, class12_X_train, class13_X_train, class14_X_train,
]
train_label_parts = [
    class0_Y_train, class1_Y_train, class2_Y_train, class3_Y_train, class4_Y_train,
    class5_Y_train, class6_Y_train, class7_Y_train, class8_Y_train, class9_Y_train,
    class10_Y_train, class11_Y_train, class12_Y_train, class13_Y_train, class14_Y_train,
]
X_train = pd.concat(train_feature_parts)
Y_train = pd.concat(train_label_parts)

# testing data
test_feature_parts = [
    class0_X_test, class1_X_test, class2_X_test, class3_X_test, class4_X_test,
    class5_X_test, class6_X_test, class7_X_test, class8_X_test, class9_X_test,
    class10_X_test, class11_X_test, class12_X_test, class13_X_test, class14_X_test,
]
test_label_parts = [
    class0_Y_test, class1_Y_test, class2_Y_test, class3_Y_test, class4_Y_test,
    class5_Y_test, class6_Y_test, class7_Y_test, class8_Y_test, class9_Y_test,
    class10_Y_test, class11_Y_test, class12_Y_test, class13_Y_test, class14_Y_test,
]
X_test = pd.concat(test_feature_parts)
Y_test = pd.concat(test_label_parts)

In [25]:
# Rows per label in the training split (each class was split with test_size=0.2).
train_label_counts = Y_train.value_counts()
train_label_counts

BENIGN                        1676045
DoS Hulk                       138276
DDoS                           102411
PortScan                        72555
DoS GoldenEye                    8228
FTP-Patator                      4744
DoS slowloris                    4308
DoS Slowhttptest                 4182
SSH-Patator                      2575
Bot                              1558
Web Attack � Brute Force         1176
Web Attack � XSS                  521
Infiltration                       28
Web Attack � Sql Injection         16
Heartbleed                          8
Name: Label, dtype: int64

In [26]:
# Rows per label in the testing split (each class was split with test_size=0.2).
test_label_counts = Y_test.value_counts()
test_label_counts

BENIGN                        419012
DoS Hulk                       34570
DDoS                           25603
PortScan                       18139
DoS GoldenEye                   2058
FTP-Patator                     1187
DoS slowloris                   1077
DoS Slowhttptest                1046
SSH-Patator                      644
Bot                              390
Web Attack � Brute Force         294
Web Attack � XSS                 131
Infiltration                       8
Web Attack � Sql Injection         5
Heartbleed                         3
Name: Label, dtype: int64

In [27]:
# Write the four split objects to CSV, one file each, without the row index.
# Order is kept: training features, training labels, testing features, testing labels.
split_outputs = [
    (X_train, "/Users/Safa BENABDESSADOK/Desktop/M1 GL/SAD/SAD project/Dataset/CIC-IDS2017/X_train.csv"),
    (Y_train, "/Users/Safa BENABDESSADOK/Desktop/M1 GL/SAD/SAD project/Dataset/CIC-IDS2017/Y_train.csv"),
    (X_test, "/Users/Safa BENABDESSADOK/Desktop/M1 GL/SAD/SAD project/Dataset/CIC-IDS2017/X_test.csv"),
    (Y_test, "/Users/Safa BENABDESSADOK/Desktop/M1 GL/SAD/SAD project/Dataset/CIC-IDS2017/Y_test.csv"),
]
for split_frame, split_csv_path in split_outputs:
    split_frame.to_csv(split_csv_path, index=False)