In [1]:
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from collections import Counter
from glob import glob
from tqdm import tqdm
import numpy as np

In [2]:
def process_data(df, is_drop=True):
    df = df.drop(columns=[
        "Flow ID",
        "Src IP",
    ])

    if is_drop:
        df = df.drop(columns=[
            "Dst IP",
            "Timestamp"
        ])
    else:
        # Timestamp
        import datetime
        # 2017-07-07 11:59:50.315195 -> 1500000000.315195
        DATEFORMAT = "%Y-%m-%d %H:%M:%S.%f"
        df["continuous_timestamp"] = df["Timestamp"].apply(
            lambda x: datetime.strptime(x, DATEFORMAT).timestamp()
        )
        df = df.drop(columns=[
            "Timestamp",
        ])
        df = df.rename(columns={
            "continuous_timestamp": "Timestamp"
        })
        # Dst IP
        import ipaddress as ip
        df["destination_ip"] = df["Dst IP"].apply(
            lambda x: int(ip.IPv4Address(x))
        )
        df = df.drop(columns=[
            "Dst IP",
        ])
        df = df.rename(columns={
            "destination_ip": "Dst IP"
        })
    return df
    

In [3]:
def read_data(pattern=0):
    if pattern == 0:
        file_path = "../data_cicids2017/0_raw/friday.csv"
        df = pd.read_csv(file_path)
        df = process_data(df)
    else:
        directory_path = "../data_cicids2017/0_raw"

        df = pd.DataFrame()
        files_path = glob(f"{directory_path}/*.csv")
        print(files_path)
        for file_path in tqdm(files_path):
            df_tmp = pd.read_csv(file_path)
            df_tmp = process_data(df_tmp)
            df_tmp = df_tmp.replace([np.inf, -np.inf], np.nan)
            df_tmp = df_tmp.dropna()
            df = pd.concat([df, df_tmp], axis=0)
    return df

In [4]:
df = read_data()
X = df.drop(columns=["Label", "Attempted Category"])
y = df["Label"]

print("Before: ", Counter(y))
df = df.drop(columns=[
    'id', 'Src Port',
    'Bwd PSH Flags',
    'Fwd URG Flags', 'Bwd URG Flags', 'Fwd RST Flags', 'Bwd RST Flags',
    'Packet Length Min', 'Packet Length Max',
    'FIN Flag Count',
    'RST Flag Count', 'PSH Flag Count',
    'URG Flag Count', 'CWR Flag Count', 'ECE Flag Count',
    'Fwd Bytes/Bulk Avg', 'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg', 'Bwd Bytes/Bulk Avg',
    'ICMP Code', 'ICMP Type',
    'Total TCP Flow Time',
    'Attempted Category'
])
result_counter = Counter(y)
third_most_common = result_counter.most_common(3)[-1]
print(f"3番目に多い属性: {third_most_common}")
if third_most_common[1] > 100_000:
    more_than_third_label = [k for k, v in result_counter.items() if v >= 100_000]
    less_than_third_label = [k for k, v in result_counter.items() if v <= 100_000]
else:
    more_than_third_label = [k for k, v in result_counter.items() if v >= third_most_common[1]]
    less_than_third_label = [k for k, v in result_counter.items() if v <= third_most_common[1]]

more_than_third = df[df["Label"].isin(more_than_third_label)]
less_than_third = df[df["Label"].isin(less_than_third_label)]

print(more_than_third["Label"].value_counts())
print(less_than_third["Label"].value_counts())

Before:  Counter({'BENIGN': 288544, 'Portscan': 159066, 'DDoS': 95144, 'Botnet - Attempted': 4067, 'Botnet': 736})
3番目に多い属性: ('DDoS', 95144)
Label
BENIGN      288544
Portscan    159066
DDoS         95144
Name: count, dtype: int64
Label
DDoS                  95144
Botnet - Attempted     4067
Botnet                  736
Name: count, dtype: int64


In [5]:
# undersampling ... more than thrid
nm = NearMiss(
    n_neighbors_ver3=2,
    version=3,
    n_jobs=-1,
    sampling_strategy={
        k: third_most_common[1] for k in more_than_third["Label"].unique()
    }
)

under_x = more_than_third.drop(columns=["Label"])
under_y = more_than_third["Label"]
print("undersampling start...")
under_x_res, under_y_res = nm.fit_resample(under_x, under_y)
print("undersampling end...")

under_res = pd.concat([under_x_res, under_y_res], axis=1)
under_res = under_res[under_res["Label"] != third_most_common[0]]

if third_most_common[0] in less_than_third["Label"].unique():
    less_than_third = less_than_third[less_than_third["Label"] != third_most_common[0]]

df = pd.concat([under_res, less_than_third], axis=0)

smote_enn = SMOTEENN(
    random_state=42,
    n_jobs=-1,
    smote=SMOTE(
        k_neighbors=2,
        random_state=42,
    )
)

over_x = df.drop(columns=["Label"])
over_y = df["Label"]
print("oversampling start...")
over_x_res, over_y_res = smote_enn.fit_resample(over_x, over_y)
print("oversampling end...")

df = pd.concat([over_x_res, over_y_res], axis=1)

undersampling start...




undersampling end...
oversampling start...
oversampling end...


In [6]:
print(df.columns)

Index(['Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet',
       'Total Bwd packets', 'Total Length of Fwd Packet',
       'Total Length of Bwd Packet', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'SYN Flag Count', 'ACK Flag Count', 'Down/Up Ratio',
       'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg',
       'Bw

In [7]:
length = len(df)

ROW_COUNTER = 500_000

i = 0
counter = 0
while i < length:
    counter += 1
    if i + ROW_COUNTER > length:
        df_temp = df.iloc[i:length]
        print(f"{i:10d} - {length:10d}")
    else:
        df_temp = df.iloc[i:i + ROW_COUNTER]
        print(f"{i:10d} - {i + ROW_COUNTER:10d}")

    df_temp.to_csv(f"../data_cicids2017/1_sampling/{counter:03d}_cicids2017.csv", index=False)
    i += ROW_COUNTER

         0 -      16257
