In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

In [2]:
def preprocessing(df):
  print('=============================== Data frame ===============================')
  # Loai bo mot so cot
  if 'Timestamp' in df.columns:
    df.drop(['Timestamp'], axis=1,inplace=True)
  if 'Flow ID' in df.columns:
    df.drop(['Flow ID'], axis=1,inplace=True)
  if 'Src IP' in df.columns:
    df.drop(['Src IP'], axis=1,inplace=True)
  if 'Src Port' in df.columns:
    df.drop(['Src Port'], axis=1,inplace=True)
  if 'Dst IP' in df.columns:
    df.drop(['Dst IP'], axis=1,inplace=True)

  # Convert data type ve dang numberic
  for col in df.columns:
    if df[col].dtype == 'object' and col != 'Label':
      df[col] = pd.to_numeric(df[col], errors='coerce')

  # Thay the inf = NAN
  df.replace([np.inf, -np.inf], np.nan, inplace=True)

  # So luong diem du lieu o moi label
  print('Counting before: \n', df.iloc[:,-1].value_counts())

  print('--------------------------------------------------')

  # Dem so luong NAN moi column
  countNA = df.isna().sum()
  print(countNA)
  print(len(df))

  print('--------------------------------------------------')
      
  # Xoa cac row chua NAN
  df.dropna(inplace=True)

  # So luong diem du lieu (da xu ly) o moi label
  print('Counting after: \n', df.iloc[:,-1].value_counts())
  print('--------------------------------------------------')
  df.info()

  # Data normalization
  scaler = MinMaxScaler()

  columns_to_normalize = df.columns[:-1]  # Loại bỏ cột cuối cùng (label)

  # Normalize các cột đã chọn trong DataFrame
  df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

  return df


In [3]:
# Đường dẫn đến folder chứa các file CSV
folder_path = '/home/haohao/Downloads/cse-cic-ids2018'

# Đọc tất cả các file CSV và gộp chúng thành một DataFrame
dataframes = []
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        filepath = os.path.join(folder_path, filename)
        df = preprocessing(pd.read_csv(filepath))
        dataframes.append(df)


# Gộp tất cả DataFrames vào một DataFrame duy nhất
full_data = pd.concat(dataframes, ignore_index=True)

Counting before: 
 Benign              1048009
Brute Force -Web        362
Brute Force -XSS        151
SQL Injection            53
Name: Label, dtype: int64
--------------------------------------------------
Dst Port         0
Protocol         0
Flow Duration    0
Tot Fwd Pkts     0
Tot Bwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 79, dtype: int64
1048575
--------------------------------------------------
Counting after: 
 Benign              1042301
Brute Force -Web        362
Brute Force -XSS        151
SQL Injection            53
Name: Label, dtype: int64
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1042867 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1042867 non-null  int64  
 1   Protocol           10428

  df = preprocessing(pd.read_csv(filepath))


Counting before: 
 DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Label                            1
Name: Label, dtype: int64
--------------------------------------------------
Dst Port         1
Protocol         1
Flow Duration    1
Tot Fwd Pkts     1
Tot Bwd Pkts     1
                ..
Idle Mean        1
Idle Std         1
Idle Max         1
Idle Min         1
Label            0
Length: 79, dtype: int64
1048575
--------------------------------------------------
Counting after: 
 DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Name: Label, dtype: int64
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1048574 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048574 non-null  float64
 1   Pro

  df = preprocessing(pd.read_csv(filepath))


Counting before: 
 Benign           544200
Infilteration     68871
Label                33
Name: Label, dtype: int64
--------------------------------------------------
Dst Port         33
Protocol         33
Flow Duration    33
Tot Fwd Pkts     33
Tot Bwd Pkts     33
                 ..
Idle Mean        33
Idle Std         33
Idle Max         33
Idle Min         33
Label             0
Length: 79, dtype: int64
613104
--------------------------------------------------
Counting after: 
 Benign           538666
Infilteration     68236
Name: Label, dtype: int64
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 606902 entries, 0 to 613103
Data columns (total 79 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           606902 non-null  float64
 1   Protocol           606902 non-null  float64
 2   Flow Duration      606902 non-null  float64
 3   Tot Fwd Pkts       606902

  df = preprocessing(pd.read_csv(filepath))


Counting before: 
 Benign           238037
Infilteration     93063
Label                25
Name: Label, dtype: int64
--------------------------------------------------
Dst Port         25
Protocol         25
Flow Duration    25
Tot Fwd Pkts     25
Tot Bwd Pkts     25
                 ..
Idle Mean        25
Idle Std         25
Idle Max         25
Idle Min         25
Label             0
Length: 79, dtype: int64
331125
--------------------------------------------------
Counting after: 
 Benign           235778
Infilteration     92403
Name: Label, dtype: int64
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 328181 entries, 0 to 331111
Data columns (total 79 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           328181 non-null  float64
 1   Protocol           328181 non-null  float64
 2   Flow Duration      328181 non-null  float64
 3   Tot Fwd Pkts       328181

In [4]:
# Save file
full_data.to_csv('pre-processed.csv', index=False)

In [5]:
print('Final: \n', full_data.iloc[:,-1].value_counts())
full_data.info()

Final: 
 Benign                      13390249
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193354
SSH-Bruteforce                187589
Infilteration                 160639
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Name: Label, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16137183 entries, 0 to 16137182
Data columns (total 79 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           float64
 1   Protocol           float64
 2   Flow Duration      float64
 3   Tot Fwd Pkts       float64
 4   Tot Bwd Pkts       float64
 5   TotLen Fwd Pkts    float64
 6   TotLen Bwd Pkts    float64
 7   Fw