In [31]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Single seed reused by every stochastic step of the notebook.
RANDOM_SEED = 99
np.random.seed(RANDOM_SEED)

# Lift pandas' display truncation for column width and row count.
for option_name in ('display.max_colwidth', 'display.max_rows'):
  pd.set_option(option_name, None)

In [32]:
# !wget 'http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip' -O CIC_IDS_2017.zip
# !gdown '1X9s72a_9VzukVbFhGKHW1xLnyrnqu3MP' -O CIC_IDS_2017.zip

In [33]:
# !unzip "CIC_IDS_2017.zip"

### Carregando os dados

In [34]:
# Load every CSV under MachineLearningCVE/ into a single frame.
# os.listdir returns entries in arbitrary order and may contain non-data
# entries, so the list is filtered to *.csv and sorted: a stable row order is
# what makes the seeded sampling later in the notebook reproducible.
csv_files = sorted(
  file for file in os.listdir('MachineLearningCVE/') if file.lower().endswith('.csv')
)
df_list = [pd.read_csv(f'MachineLearningCVE/{file}') for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

In [35]:
# Strip leading/trailing whitespace from the column names
df.columns = [column_name.strip() for column_name in df.columns]

In [36]:
# (rows, columns) of the concatenated frame
df.shape

(2830743, 79)

### Limpando os dados

In [37]:
# Drop exact duplicate rows (first occurrence kept) and report how many were removed
initial_shape = df.shape
df = df.drop_duplicates(keep='first')
n_duplicates = initial_shape[0] - df.shape[0]
print(f'Shape inicial: {initial_shape}, tamanho final {df.shape} | Descartadas {n_duplicates} duplicadas')

Shape inicial: (2830743, 79), tamanho final (2522362, 79) | Descartadas 308381 duplicadas


In [38]:
# Drop every row holding at least one NaN/Null/NA value and report how many were removed
initial_shape = df.shape
df = df.dropna(axis='index', how='any')
n_missing = initial_shape[0] - df.shape[0]
print(f'Shape inicial: {initial_shape}, tamanho final {df.shape} | Descartadas {n_missing} registros com valores NA')

Shape inicial: (2522362, 79), tamanho final (2522009, 79) | Descartadas 353 registros com valores NA


In [39]:
# Replace the index with a fresh default one, discarding the old index values
# (drop=True) left gappy by the row removals in the previous cells.
df = df.reset_index(drop=True)

In [40]:
# Cap +inf values of the two rate columns at the largest finite value seen in the same column
finite_packets_mask = np.isfinite(df['Flow Packets/s'])
finite_bytes_mask = np.isfinite(df['Flow Bytes/s'])
max_finite_flow_packets_per_sec = df.loc[finite_packets_mask, 'Flow Packets/s'].max()
max_finite_flow_bytes_per_sec = df.loc[finite_bytes_mask, 'Flow Bytes/s'].max()

df.loc[df['Flow Packets/s'].eq(np.inf), 'Flow Packets/s'] = max_finite_flow_packets_per_sec
df.loc[df['Flow Bytes/s'].eq(np.inf), 'Flow Bytes/s'] = max_finite_flow_bytes_per_sec

In [41]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [42]:
# Number of rows per value of the 'Label' column
incidencias = df['Label'].value_counts()
print(incidencias)


Label
BENIGN                        2096134
DoS Hulk                       172846
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


### Dividindo dados nos conjuntos de treino e validação (divisão de teste desativada)


In [43]:
# Training set: a seeded 60% sample drawn only from rows labelled BENIGN.
benign_rows = df[df['Label'] == 'BENIGN']
sampled_benign = benign_rows.sample(frac=0.6, random_state=RANDOM_SEED)

# Every row that was not sampled (the remaining BENIGN rows and all other labels) is held out.
df_val_test = df.drop(index=sampled_benign.index).reset_index(drop=True)
df_train = sampled_benign.reset_index(drop=True)

X_train = df_train.drop(columns='Label')

In [44]:
# Number of rows per 'Label' value in the held-out frame
df_val_test['Label'].value_counts()

Label
BENIGN                        838454
DoS Hulk                      172846
DDoS                          128016
PortScan                       90819
DoS GoldenEye                  10286
FTP-Patator                     5933
DoS slowloris                   5385
DoS Slowhttptest                5228
SSH-Patator                     3219
Bot                             1953
Web Attack � Brute Force        1470
Web Attack � XSS                 652
Infiltration                      36
Web Attack � Sql Injection        21
Heartbleed                        11
Name: count, dtype: int64

In [45]:
# Disabled alternative, kept for reference: stratified validation/test split of df_val_test.
# X_val, X_test, classes_val, classes_test = train_test_split(df_val_test.drop('Label', axis='columns'), df_val_test['Label'], test_size=0.65, stratify=df_val_test['Label'], random_state=RANDOM_SEED)

# X_val, X_test = X_val.reset_index(drop=True), X_test.reset_index(drop=True)
# classes_val, classes_test =  classes_val.reset_index(drop=True), classes_test.reset_index(drop=True)

# y_val, y_test = classes_val.apply(lambda c: 1 if c == 'BENIGN' else 0), classes_test.apply(lambda c: 1 if c == 'BENIGN' else 0)

# The whole held-out frame is used as the validation set.
X_val = df_val_test.drop('Label', axis='columns')
# Binary target: 1 for BENIGN, 0 for any other label. A vectorised comparison
# replaces the per-row Python lambda; the resulting values are identical.
y_val = df_val_test['Label'].eq('BENIGN').astype('int64')

### Analisando correlação entre features

In [46]:
def get_highly_correlated_features(correlation_matrix, threshold):
  """List the feature pairs whose correlation magnitude exceeds `threshold`.

  Only the strict lower triangle of the matrix is scanned, so each unordered
  pair is reported once and the diagonal is ignored.

  Args:
    correlation_matrix: square DataFrame of pairwise correlation coefficients.
    threshold: a pair is kept when abs(coefficient) > threshold.

  Returns:
    A list of ((feature_i, feature_j), coefficient) tuples, strongest
    correlation first. Ordering uses the absolute coefficient so that strong
    negative correlations rank alongside strong positive ones, consistent
    with the absolute-value filter. NaN coefficients never pass the filter.
  """
  columns = correlation_matrix.columns
  correlated_pairs = []
  for i in range(len(columns)):
    for j in range(i):
      coefficient = correlation_matrix.iloc[i, j]
      if abs(coefficient) > threshold:
        correlated_pairs.append(((columns[i], columns[j]), coefficient))
  # sorted() is stable: ties keep their scan order.
  return sorted(correlated_pairs, key=lambda item: abs(item[1]), reverse=True)


In [47]:
# Absolute pairwise correlations of the training features; keep the pairs above 0.95
corr_matrix = X_train.corr(method='pearson').abs()
correlation_list = get_highly_correlated_features(correlation_matrix=corr_matrix, threshold=0.95)

In [48]:
# First ten entries of the sorted pair list
correlation_list[:10]

[(('SYN Flag Count', 'Fwd PSH Flags'), np.float64(1.0)),
 (('CWE Flag Count', 'Fwd URG Flags'), np.float64(1.0)),
 (('Avg Fwd Segment Size', 'Fwd Packet Length Mean'), np.float64(1.0)),
 (('Fwd Header Length.1', 'Fwd Header Length'), np.float64(1.0)),
 (('Subflow Fwd Packets', 'Total Fwd Packets'), np.float64(1.0)),
 (('Subflow Fwd Bytes', 'Total Length of Fwd Packets'), np.float64(1.0)),
 (('Subflow Bwd Packets', 'Total Backward Packets'), np.float64(1.0)),
 (('Avg Bwd Segment Size', 'Bwd Packet Length Mean'),
  np.float64(0.9999999999999994)),
 (('Subflow Bwd Bytes', 'Total Length of Bwd Packets'),
  np.float64(0.9999998390204298)),
 (('Total Backward Packets', 'Total Fwd Packets'),
  np.float64(0.9993317736445169))]

In [49]:
# Greedy selection of redundant features: walk the pairs in list order and,
# unless one member of the pair is already marked for removal, mark the
# second member.
f2drop = []
for (first_feature, second_feature), _ in correlation_list:
  if first_feature in f2drop or second_feature in f2drop:
    continue
  f2drop.append(second_feature)

In [50]:
# Features selected for removal by the correlation filter
f2drop

['Fwd PSH Flags',
 'Fwd URG Flags',
 'Fwd Packet Length Mean',
 'Fwd Header Length',
 'Total Fwd Packets',
 'Total Length of Fwd Packets',
 'Total Backward Packets',
 'Bwd Packet Length Mean',
 'Total Length of Bwd Packets',
 'Subflow Fwd Packets',
 'Flow Duration',
 'Subflow Bwd Packets',
 'RST Flag Count',
 'Packet Length Mean',
 'Flow IAT Max',
 'Idle Mean',
 'Fwd IAT Total',
 'Max Packet Length',
 'Fwd Packet Length Max',
 'Bwd IAT Max',
 'Bwd IAT Mean',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Idle Max']

In [51]:
# 'Destination Port' is added to the drop list by hand, independently of the correlation filter
f2drop = [*f2drop, 'Destination Port']

In [52]:
# Remove the selected features from the training and validation feature matrices
X_train = X_train.drop(columns=f2drop)
X_val = X_val.drop(columns=f2drop)
# X_test = X_test.drop(f2drop, axis='columns')

In [53]:
# Five random rows of the reduced training features (no random_state passed to sample)
X_train.sample(5)

Unnamed: 0,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,...,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min
664419,0,0.0,0,0,0.0,0.0,62500.0,32.0,0.0,32,...,349,32832,0,32,0.0,0.0,0,0,0.0,0
603901,0,158.920765,1185,0,545.112832,1735.896909,6.192827,174933.5833,392899.5432,85,...,8192,1013,3,20,0.0,0.0,0,0,0.0,0
1228612,29,0.0,131,131,0.0,10162.92438,127.036555,10495.66667,18172.96576,3,...,-1,-1,1,32,0.0,0.0,0,0,0.0,0
353193,0,0.0,6,6,0.0,100558.659218,22346.368715,59.666667,64.66323,1,...,972,0,0,32,0.0,0.0,0,0,0.0,0
376919,0,274.573305,400,0,157.312004,10838.19704,71.176219,15454.6,24329.91155,1,...,8192,40,4,20,0.0,0.0,0,0,0.0,0


In [54]:
# Five random rows of the reduced validation features (no random_state passed to sample)
X_val.sample(5)

Unnamed: 0,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,...,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min
686502,35,0.0,127,127,0.0,2063694.0,25477.70701,52.333333,87.185626,1,...,-1,-1,1,32,0.0,0.0,0,0,0.0,0
9149,0,10.263203,4380,0,2177.344966,11210.4,8.673052,129712.125,366457.4764,3,...,8192,229,2,20,0.0,0.0,0,0,0.0,0
911826,6,0.0,6,6,0.0,136363.6,22727.27273,88.0,0.0,88,...,63919,31270,0,20,0.0,0.0,0,0,0.0,0
737981,32,0.0,113,113,0.0,12155.25,167.658647,7952.666667,13770.95866,1,...,-1,-1,1,44,0.0,0.0,0,0,0.0,0
187407,0,0.0,0,0,0.0,0.0,61224.4898,24.5,33.234019,1,...,395,-1,0,32,0.0,0.0,0,0,0.0,0


In [55]:
# Min-max scaling: the scaler is fitted on the training features only and then
# applied to both sets.
minmax_scaler = MinMaxScaler().fit(X_train)

norm_X_train = minmax_scaler.transform(X_train)
norm_X_val = minmax_scaler.transform(X_val)

# Re-wrap the scaled validation features under their column names and append
# the binary label as an extra column.
X_val = pd.DataFrame(data=norm_X_val, columns=X_val.columns)
attack_data = pd.concat((X_val, y_val), axis='columns')

attack_data.head()

Unnamed: 0,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,...,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min,Label
0,0.002617,0.0,0.000307,0.002072,0.0,0.005872,0.339744,5.333333e-07,0.0,5.499999e-07,...,0.003922,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.002617,0.0,0.000307,0.002072,0.0,0.005898,0.34127,4.5e-07,0.0,4.666666e-07,...,0.003922,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.002617,0.002481,0.0,0.0,0.0,0.010202,0.416667,1.333333e-07,0.0,1.5e-07,...,0.0,5e-06,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.002617,0.002481,0.0,0.0,0.0,0.010202,0.416667,1.333333e-07,0.0,1.5e-07,...,0.0,5e-06,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.002617,0.002026,0.0,0.0,0.0,0.012626,0.5,1.125e-07,8.338498e-09,1.25e-07,...,0.0,1e-05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [57]:
# NOTE: norm_X_val is rebound here to the array built from attack_data, i.e. the
# scaled validation features followed by the 'Label' column.
norm_X_val = attack_data.to_numpy()

np.save("CICIDS_train.npy", norm_X_train)  # Save the normalised training data to a .npy file
np.save("CICIDS_attack.npy", norm_X_val)  # Save the normalised validation data (features plus label column) to a .npy file