In [1]:
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from collections import Counter
from glob import glob
from tqdm import tqdm
import numpy as np

In [2]:
def process_data(df, is_drop=True):
    """Remove or numerically encode the identifier columns of a flow table.

    ``Flow ID`` and ``Src IP`` are always removed.  ``Dst IP`` and
    ``Timestamp`` are either removed as well or replaced by numeric
    encodings, depending on ``is_drop``.

    Args:
        df: DataFrame containing at least the columns ``Flow ID``, ``Src IP``,
            ``Dst IP`` and ``Timestamp``.  It is not modified.
        is_drop: When True (default), ``Dst IP`` and ``Timestamp`` are
            dropped.  When False, ``Timestamp`` strings in the form
            ``YYYY-MM-DD HH:MM:SS.ffffff`` become POSIX timestamps (float)
            and ``Dst IP`` dotted-quad strings become integers; both encoded
            columns are placed at the end of the frame, in that order.

    Returns:
        A new DataFrame with the columns described above.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a timestamp does not match the expected format or an
            address is not a valid IPv4 address (only when ``is_drop`` is
            False).
    """
    # `strptime` lives on the `datetime` class, not on the `datetime` module.
    from datetime import datetime
    import ipaddress

    df = df.drop(columns=["Flow ID", "Src IP"])

    if is_drop:
        return df.drop(columns=["Dst IP", "Timestamp"])

    # e.g. "2017-07-07 11:59:50.315195" -> seconds since the epoch as a float.
    # The parsed datetimes are naive, so `.timestamp()` interprets them in the
    # local timezone of the machine running the notebook.
    date_format = "%Y-%m-%d %H:%M:%S.%f"
    timestamp_encoded = df["Timestamp"].apply(
        lambda x: datetime.strptime(x, date_format).timestamp()
    )
    # e.g. "192.168.0.1" -> 3232235521
    dst_ip_encoded = df["Dst IP"].apply(
        lambda x: int(ipaddress.IPv4Address(x))
    )

    # Drop the raw columns first, then append the encoded ones under the
    # original names so they end up as the last two columns.
    df = df.drop(columns=["Timestamp", "Dst IP"])
    df["Timestamp"] = timestamp_encoded
    df["Dst IP"] = dst_ip_encoded
    return df
    

In [3]:
def read_data(pattern=0):
    """Load raw CSV file(s) and pass each through ``process_data``.

    Args:
        pattern: ``0`` (default) loads only ``friday.csv``.  Any other value
            loads every ``*.csv`` file in the raw-data directory, removes the
            rows that contain NaN or +/-inf, and stacks the files row-wise.

    Returns:
        The processed DataFrame.  In the multi-file case the original
        per-file row index is kept (so index values repeat across files), and
        an empty DataFrame is returned when no CSV file is found.
    """
    if pattern == 0:
        file_path = "../data_cicids2017/0_raw/friday.csv"
        # NOTE(review): unlike the multi-file branch, this path does not
        # remove NaN / inf rows -- confirm whether that is intended.
        return process_data(pd.read_csv(file_path))

    directory_path = "../data_cicids2017/0_raw"
    # glob() returns files in filesystem order; sort so the row order of the
    # combined frame is the same on every run and every machine.
    files_path = sorted(glob(f"{directory_path}/*.csv"))
    print(files_path)

    # Collect the per-file frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = []
    for file_path in tqdm(files_path):
        df_tmp = process_data(pd.read_csv(file_path))
        # Treat +/-inf like missing values, then drop every incomplete row.
        df_tmp = df_tmp.replace([np.inf, -np.inf], np.nan).dropna()
        frames.append(df_tmp)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [5]:
# Load every raw CSV file and separate the target from the feature columns.
df = read_data(1)
y = df["Label"]
X = df.drop(columns=["Label", "Attempted Category"])

# Class distribution before any resampling.
result_counter = Counter(y)
print("Before: ", result_counter)

# (label, count) of the third most frequent class.
third_most_common = result_counter.most_common(3)[-1]
print(f"3番目に多い属性: {third_most_common}")

# The split boundary is the third class's count, capped at 100_000.
# Both comparisons are inclusive, so a class sitting exactly on the boundary
# is placed in both groups.
boundary = min(third_most_common[1], 100_000)
more_than_third_label = [
    label for label, count in result_counter.items() if count >= boundary
]
less_than_third_label = [
    label for label, count in result_counter.items() if count <= boundary
]

label_column = df["Label"]
more_than_third = df[label_column.isin(more_than_third_label)]
less_than_third = df[label_column.isin(less_than_third_label)]

for subset in (more_than_third, less_than_third):
    print(subset["Label"].value_counts())

['../data_cicids2017/0_raw/tuesday.csv', '../data_cicids2017/0_raw/monday.csv', '../data_cicids2017/0_raw/thursday.csv', '../data_cicids2017/0_raw/wednesday.csv', '../data_cicids2017/0_raw/friday.csv']


100%|██████████| 5/5 [00:10<00:00,  2.10s/it]


Before:  Counter({'BENIGN': 1582561, 'Portscan': 159066, 'DoS Hulk': 158468, 'DDoS': 95144, 'Infiltration - Portscan': 71767, 'DoS GoldenEye': 7567, 'Botnet - Attempted': 4067, 'FTP-Patator': 3972, 'DoS Slowloris': 3859, 'DoS Slowhttptest - Attempted': 3368, 'SSH-Patator': 2961, 'DoS Slowloris - Attempted': 1847, 'DoS Slowhttptest': 1740, 'Web Attack - Brute Force - Attempted': 1292, 'Botnet': 736, 'Web Attack - XSS - Attempted': 655, 'DoS Hulk - Attempted': 581, 'DoS GoldenEye - Attempted': 80, 'Web Attack - Brute Force': 73, 'Infiltration - Attempted': 45, 'Infiltration': 36, 'SSH-Patator - Attempted': 27, 'Web Attack - XSS': 18, 'Web Attack - SQL Injection': 13, 'FTP-Patator - Attempted': 12, 'Heartbleed': 11, 'Web Attack - SQL Injection - Attempted': 5})
3番目に多い属性: ('DoS Hulk', 158468)
Label
BENIGN      1582561
Portscan     159066
DoS Hulk     158468
Name: count, dtype: int64
Label
DDoS                                      95144
Infiltration - Portscan                   71767
DoS Go

In [None]:
# oversampling ... less than third
# Over-sampling: the classes at or below the boundary are resampled with
# SMOTE followed by Edited Nearest Neighbours cleaning (SMOTEENN).
smote_enn = SMOTEENN(
    random_state=42,
    n_jobs=-1,
    smote=SMOTE(
        # presumably kept small because the rarest classes have only a
        # handful of rows (see the counts printed earlier) -- TODO confirm
        k_neighbors=2,
        random_state=42,
    )
)
over_x = less_than_third.drop(columns=["Label", "Attempted Category"])
over_y = less_than_third["Label"]
print("oversampling start...")
over_x_res, over_y_res = smote_enn.fit_resample(over_x, over_y)
print("oversampling end...")

over_res = pd.concat([over_x_res, over_y_res], axis=1)
print(over_res["Label"].value_counts())

# Under-sampling: the classes at or above the boundary are reduced with
# NearMiss (version 3) towards the size of the third most common class.
under_x = more_than_third.drop(columns=["Label", "Attempted Category"])
under_y = more_than_third["Label"]
under_counts = Counter(under_y)
nm = NearMiss(
    n_neighbors_ver3=2,
    version=3,
    n_jobs=-1,
    # An under-sampling target may not exceed the class's current size, so
    # cap it for classes that are smaller than the third most common one.
    sampling_strategy={
        k: min(v, third_most_common[1]) for k, v in under_counts.items()
    }
)
print("undersampling start...")
under_x_res, under_y_res = nm.fit_resample(under_x, under_y)
print("undersampling end...")

under_res = pd.concat([under_x_res, under_y_res], axis=1)
# A label that sits exactly on the boundary is present in BOTH groups; keep
# its over-sampled version and remove it here so it is not duplicated.
# Labels that only exist in the under-sampling group must be kept: removing
# the third most common label unconditionally deletes that class from the
# final dataset whenever the 100_000 cap puts it in this group only.
shared_labels = set(over_y.unique()) & set(under_y.unique())
under_res = under_res[~under_res["Label"].isin(list(shared_labels))]
print(under_res["Label"].value_counts())

df = pd.concat([over_res, under_res], axis=0)
# Free the intermediate frames; only the combined `df` is used afterwards.
del over_res, under_res

oversampling start...


In [None]:
# Row count and per-class counts of the current `df`.
length = df.shape[0]
label_counts = df["Label"].value_counts()
print(label_counts)

In [7]:
# Disabled export step (kept commented out): it would write `df` to CSV in
# consecutive slices of ROW_COUNTER (500_000) rows, one numbered file per
# slice, printing the row range of each slice.
# NOTE(review): the output path below is an absolute, user-specific location.
# length = len(df)

# ROW_COUNTER = 500_000

# i = 0
# counter = 0
# while i < length:
#     counter += 1
#     if i + ROW_COUNTER > length:
#         df_temp = df.iloc[i:length]
#         print(f"{i:10d} - {length:10d}")
#     else:
#         df_temp = df.iloc[i:i + ROW_COUNTER]
#         print(f"{i:10d} - {i + ROW_COUNTER:10d}")

#     df_temp.to_csv(f"/home/ishibashi02/test/drl_2/data_cicids2017/1_sampling/{counter:03d}_cicids2017.csv", index=False)
#     i += ROW_COUNTER