In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

In [None]:
# !wget 'http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip' -O CIC_IDS_2017.zip
# !gdown '1X9s72a_9VzukVbFhGKHW1xLnyrnqu3MP' -O CIC_IDS_2017.zip

In [3]:
# !unzip "CIC_IDS_2017.zip"

### Carregando os dados

In [4]:
df_list = []
for file in os.listdir('MachineLearningCVE/'):
  df_aux = pd.read_csv(f'MachineLearningCVE/{file}')
  df_list.append(df_aux)
df = pd.concat(df_list, ignore_index=True)

In [5]:
# Removendo espaços em branco no nome das colunas
df.columns = df.columns.str.strip()

In [6]:
df.shape

(2830743, 79)

### Limpando os dados

In [7]:
# Descartando duplicadas
initial_len = df.shape[0]
df = df.drop_duplicates()
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartadas {initial_len - df.shape[0]} duplicadas')

Tamanho inicial: 2830743, tamanho final 2522362 | Descartadas 308381 duplicadas


In [8]:
# Descartando registros com valores NaN/Null/NA
initial_len = df.shape[0]
df = df.dropna()
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartados {initial_len - df.shape[0]} registros com valores NA')

Tamanho inicial: 2522362, tamanho final 2522009 | Descartados 353 registros com valores NA


In [9]:
df = df.reset_index(drop=True)

In [10]:
# Evitando registros com valores não finitos
max_finite_flow_packets_per_sec = df[np.isfinite(df['Flow Packets/s'])]['Flow Packets/s'].max()
max_finite_flow_bytes_per_sec = df[np.isfinite(df['Flow Bytes/s'])]['Flow Bytes/s'].max()

df.loc[df['Flow Packets/s'] == np.inf, 'Flow Packets/s'] = max_finite_flow_packets_per_sec
df.loc[df['Flow Bytes/s'] == np.inf, 'Flow Bytes/s'] = max_finite_flow_bytes_per_sec

In [11]:
df

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.000000,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.000000,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522004,56669,53,1,1,0,0,0,0,0.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2522005,53,445,2,2,94,350,47,47,47.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2522006,59317,110,1,1,0,0,0,0,0.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2522007,54726,81,1,1,0,0,0,0,0.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [29]:
incidencias = df['Label'].value_counts()
print(incidencias)


BENIGN                        2096134
DoS Hulk                       172846
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: Label, dtype: int64


### Dividindo dados nos conjuntos de treino, validação e teste


In [13]:
df_train = df.query('Label == "BENIGN"').sample(frac=0.6, random_state=RANDOM_SEED)
df_val_test = df.drop(df_train.index)

df_train = df_train.reset_index(drop=True)
df_val_test = df_val_test.reset_index(drop=True)

X_train = df_train.drop('Label', axis='columns')

In [14]:
X_val, X_test, classes_val, classes_test = train_test_split(df_val_test.drop('Label', axis='columns'), df_val_test['Label'], test_size=0.65, stratify=df_val_test['Label'], random_state=RANDOM_SEED)

X_val, X_test = X_val.reset_index(drop=True), X_test.reset_index(drop=True)
classes_val, classes_test =  classes_val.reset_index(drop=True), classes_test.reset_index(drop=True)

y_val, y_test = classes_val.apply(lambda c: 0 if c == 'BENIGN' else 1), classes_test.apply(lambda c: 0 if c == 'BENIGN' else 1)

### Analisando correlação entre features

In [15]:
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)


In [16]:
corr_matrix = X_train.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)

In [17]:
correlation_list[:10]


[(('SYN Flag Count', 'Fwd PSH Flags'), 1.0),
 (('CWE Flag Count', 'Fwd URG Flags'), 1.0),
 (('Avg Fwd Segment Size', 'Fwd Packet Length Mean'), 1.0),
 (('Fwd Header Length.1', 'Fwd Header Length'), 1.0),
 (('Subflow Fwd Packets', 'Total Fwd Packets'), 1.0),
 (('Subflow Fwd Bytes', 'Total Length of Fwd Packets'), 1.0),
 (('Subflow Bwd Packets', 'Total Backward Packets'), 1.0),
 (('Avg Bwd Segment Size', 'Bwd Packet Length Mean'), 0.999999999999999),
 (('Subflow Bwd Bytes', 'Total Length of Bwd Packets'), 0.9999998547949555),
 (('Total Backward Packets', 'Total Fwd Packets'), 0.9992593824882177)]

In [18]:
# Drop high correlated features in correlation list
f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])

In [19]:
f2drop

['Fwd PSH Flags',
 'Fwd URG Flags',
 'Fwd Packet Length Mean',
 'Fwd Header Length',
 'Total Fwd Packets',
 'Total Length of Fwd Packets',
 'Total Backward Packets',
 'Bwd Packet Length Mean',
 'Total Length of Bwd Packets',
 'Subflow Fwd Packets',
 'Flow Duration',
 'Subflow Bwd Packets',
 'RST Flag Count',
 'Packet Length Mean',
 'Flow IAT Max',
 'Idle Mean',
 'Fwd IAT Total',
 'Fwd Packet Length Max',
 'Max Packet Length',
 'Bwd IAT Max',
 'Bwd IAT Mean',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Idle Max']

In [30]:
f2drop = f2drop + ['Destination Port']

In [31]:
X_train = X_train.drop(f2drop, axis='columns')
X_val = X_val.drop(f2drop, axis='columns')
X_test = X_test.drop(f2drop, axis='columns')

In [32]:
X_train.sample(5)

Unnamed: 0,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,...,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min
664419,37,0.0,0,0,0.0,36.96198,0.998973,2002057.0,0.0,2002057,...,32850,-1,1,20,0.0,0.0,0,0,0.0,0
603901,2,0.0,6,6,0.0,155339.8,38834.95146,34.33333,24.54248,6,...,1024,0,1,24,0.0,0.0,0,0,0.0,0
1228612,0,0.0,0,0,0.0,0.0,24390.2439,82.0,0.0,82,...,972,284,0,32,0.0,0.0,0,0,0.0,0
353193,34,0.0,50,50,0.0,1090909.0,25974.025974,51.33333,50.08326,3,...,-1,-1,1,20,0.0,0.0,0,0,0.0,0
376919,0,141.551773,1460,0,490.887903,95.15637,0.53627,1921240.0,3859869.0,48,...,8192,127,10,20,556321.5,1238850.259,3085114,50508,81593.63474,9800546


### Exportando dados em npy

In [33]:
X_train

Unnamed: 0,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,...,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min
0,6,20.124612,46,46,0.000000,4.041652e+01,1.339774,8.956734e+05,1.870427e+06,4,...,65182,469,4,20,0.000000e+00,0.00000,0,0,0.00000,0
1,0,13.279056,110,0,55.000000,3.566936e+02,18.773349,6.214483e+04,8.492770e+04,1,...,65535,227,1,32,0.000000e+00,0.00000,0,0,0.00000,0
2,0,0.000000,1448,0,726.630810,2.550000e+07,49382.716050,2.700000e+01,4.330127e+01,2,...,395,229,0,32,0.000000e+00,0.00000,0,0,0.00000,0
3,2,0.000000,0,0,0.000000,8.163265e+04,40816.326530,4.900000e+01,0.000000e+00,49,...,1024,-1,1,24,0.000000e+00,0.00000,0,0,0.00000,0
4,6,0.000000,6,6,0.000000,4.000000e+06,666666.666667,3.000000e+00,0.000000e+00,3,...,913,16425,0,20,0.000000e+00,0.00000,0,0,0.00000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257675,0,71.698753,1706,0,826.831785,9.686983e+02,2.865406,3.739186e+05,1.268489e+06,18,...,29200,259,3,32,0.000000e+00,0.00000,0,0,0.00000,0
1257676,48,0.000000,48,48,0.000000,4.329200e+03,90.191657,2.217500e+04,0.000000e+00,22175,...,-1,-1,0,32,0.000000e+00,0.00000,0,0,0.00000,0
1257677,39,0.000000,71,71,0.000000,4.701056e+03,85.473738,2.339900e+04,0.000000e+00,23399,...,-1,-1,0,32,0.000000e+00,0.00000,0,0,0.00000,0
1257678,0,628.973313,1988,2,857.386100,1.378520e+03,2.171354,4.764230e+05,1.854548e+06,3,...,8192,42930,14,20,3.817839e+06,0.00000,3817839,3817839,0.00000,9992336


In [34]:
np.save("CICIDS_train.npy", X_train)  # Salva os dados de treino normalizados em um arquivo .npy
np.save("CICIDS_val.npy", X_val)  # Salva os dados de validação normalizados em um arquivo .npy
np.save("CICIDS_test.npy", X_test)  # Salva os dados de teste normalizados em um arquivo .npy
