In [1]:
import os
from glob import glob

import pandas as pd
import flow_package as f_p
import imblearn.over_sampling as im_os
import imblearn.under_sampling as im_us

In [2]:
def read_csv(files):
    """Load several CSV files and stack them into one DataFrame.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read. A path that does not exist is
        reported on stdout and skipped.

    Returns
    -------
    pandas.DataFrame
        The rows of every readable file, in input order, with a fresh
        0..n-1 index. An empty DataFrame when no file could be read.
    """
    frames = []
    for file in files:
        if os.path.exists(file):
            frames.append(pd.read_csv(file))
        else:
            print(f"File not found: {file}")

    # pd.concat() rejects an empty list, so keep the "nothing loaded" result
    # explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(frames, ignore_index=True)

In [3]:
# Per-class row-count threshold used to split the classes into an
# over-sampling and an under-sampling subset.
SIZE = 5796
PATH = os.path.abspath("./raw_after_filtered/cicids2017/data")
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# the concatenated row order (and therefore any seeded sampling done on it)
# is the same on every machine.
files = sorted(glob(os.path.join(PATH, "*.csv")))

df = read_csv(files)
# Drop columns that are entirely empty *before* dropping incomplete rows:
# in the opposite order a single all-NaN column removes every row.
df = df.dropna(how="all", axis=1).dropna(how="any")

counts = df['Number Label'].value_counts()
print(counts)

# Classes with at most SIZE rows form the over-sampling subset, classes with
# at least SIZE rows form the under-sampling subset.
# NOTE(review): both comparisons are inclusive, so a class with exactly SIZE
# rows lands in both subsets -- make sure it is not counted twice when the
# subsets are recombined.
over_labels = [label for label, count in counts.items() if count <= SIZE]
under_labels = [label for label, count in counts.items() if count >= SIZE]
umder_labels = under_labels  # misspelled original name, kept as an alias

df_over = df[df['Number Label'].isin(over_labels)]
df_under = df[df['Number Label'].isin(under_labels)]

print("over")
print(df_over['Number Label'].value_counts())
print("under")
print(df_under['Number Label'].value_counts())

Number Label
0     2271320
11     230124
8      158804
1      128025
12      10293
5        7935
6        5897
9        5796
10       5499
7        1956
2        1507
3         652
14         36
4          21
13         11
Name: count, dtype: int64
over
Number Label
9     5796
10    5499
7     1956
2     1507
3      652
14      36
4       21
13      11
Name: count, dtype: int64
under
Number Label
0     2271320
11     230124
8      158804
1      128025
12      10293
5        7935
6        5897
9        5796
Name: count, dtype: int64


In [4]:
# Resample both subsets to exactly SIZE rows per class.  The targets are
# passed explicitly so the result does not depend on some class happening to
# contain exactly SIZE rows (the samplers' default strategy balances against
# the smallest / largest class present instead).
#
# NOTE(review): every non-label column is handed to the samplers, including
# "Unnamed: 0" if it is present (a later cell drops that column) -- confirm
# it is meant to take part in the resampling.

# `cnn` is a plain random under-sampler; the name is kept for compatibility.
under_target = {label: SIZE for label in df_under['Number Label'].unique()}
cnn = im_us.RandomUnderSampler(
    sampling_strategy=under_target,
    random_state=42,
)

df_under_x, df_under_y = cnn.fit_resample(
    df_under.drop(columns=['Number Label']),
    df_under['Number Label'],
)
# Re-attach the label column to the resampled features.
df_under = pd.concat([df_under_x, df_under_y], axis=1)
print(df_under['Number Label'].value_counts())

# `ada` is a SMOTE over-sampler; the name is kept for compatibility.
over_target = {label: SIZE for label in df_over['Number Label'].unique()}
ada = im_os.SMOTE(
    sampling_strategy=over_target,
    random_state=42
)

df_over_x, df_over_y = ada.fit_resample(
    df_over.drop(columns=['Number Label']),
    df_over['Number Label'],
)
# Re-attach the label column to the resampled features.
df_over = pd.concat([df_over_x, df_over_y], axis=1)
print(df_over['Number Label'].value_counts())



Number Label
0     5796
1     5796
5     5796
6     5796
8     5796
9     5796
11    5796
12    5796
Name: count, dtype: int64




Number Label
13    5796
14    5796
9     5796
10    5796
2     5796
3     5796
4     5796
7     5796
Name: count, dtype: int64


In [5]:
# The class split uses inclusive comparisons on both sides, so a class whose
# row count equals the threshold is present in df_over *and* df_under.
# Keep only the df_under copy, otherwise that class ends up with twice as
# many rows as every other class in the combined sample.
shared = df_over['Number Label'].isin(df_under['Number Label'].unique())
hybrid_sample = pd.concat([df_over[~shared], df_under]).drop(columns=["Unnamed: 0"])
# Shuffle the rows; seeded with the same value as the samplers so the
# written file is reproducible.
hybrid_sample = hybrid_sample.sample(frac=1, random_state=42).reset_index(drop=True)

# NOTE(review): to_csv() writes the index as an unnamed first column by
# default, which reads back as "Unnamed: 0" -- pass index=False if no
# downstream reader relies on that column.
hybrid_sample.to_csv("./hybrid_sample.csv")