In [None]:
import ipaddress as ip
import pandas as pd
import numpy as np

from glob import glob
from datetime import datetime
# train_test_split
from sklearn.model_selection import train_test_split
import scipy
import os
import csv
# ENN
from imblearn.under_sampling import EditedNearestNeighbours
# K-Means
from sklearn.cluster import KMeans

# WGAN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

DIR_PATH = "./data"

In [None]:
def get_csv_files(dir_name_list):
    for _dir in dir_name_list:
        paths = glob(f"{DIR_PATH}/raw/*.csv")
        
        for path in paths:
            print(path)
            filename = os.path.basename(path)
            read_write(path, f"{DIR_PATH}/raw_after/{_dir[0]}/{filename}", _dir[2])


def read_write(path, write_path, labels=None):
    with open(path, "r", encoding="utf-8") as inputs, open(write_path, "w", newline="") as outputs:
        reader = csv.reader(inputs)
        writer = csv.writer(outputs)

        headers = next(reader)
        if not labels:
            headers = [ head.strip() for head in headers ]
            writer.writerow(headers)
        else:
            writer.writerow(labels)

        for row in reader:
            if row == headers:
                break
            writer.writerow(row)

In [None]:
datasets = [
    ["cicids2017", "BENIGN", None],
]

get_csv_files(datasets)

./raw/cicids2017/DDoS-Friday-WorkingHours-Afternoon.pcap_ISCX.csv
./raw/cicids2017/WebAttacks-Thursday-WorkingHours-Morning.pcap_ISCX.csv
./raw/cicids2017/Benign-Monday-WorkingHours.pcap_ISCX.csv
./raw/cicids2017/Bruteforce-Tuesday-WorkingHours.pcap_ISCX.csv
./raw/cicids2017/Botnet-Friday-WorkingHours-Morning.pcap_ISCX.csv
./raw/cicids2017/Portscan-Friday-WorkingHours-Afternoon.pcap_ISCX.csv
./raw/cicids2017/DoS-Wednesday-WorkingHours.pcap_ISCX.csv
./raw/cicids2017/Infiltration-Thursday-WorkingHours-Afternoon.pcap_ISCX.csv
./raw/csecicids2018/02-14-2018.csv
./raw/csecicids2018/02-15-2018.csv
./raw/csecicids2018/02-20-2018.csv
./raw/csecicids2018/02-21-2018.csv
./raw/csecicids2018/03-01-2018.csv
./raw/csecicids2018/03-02-2018.csv
./raw/csecicids2018/02-23-2018.csv
./raw/csecicids2018/02-22-2018.csv
./raw/csecicids2018/02-16-2018.csv
./raw/csecicids2018/02-28-2018.csv


In [None]:
files = glob(f"{DIR_PATH}/raw_after/cicids2017/*.csv")

df = pd.DataFrame()

for file in files:
    if "label_" in file:
        continue
    tmp = pd.read_csv(file)
    df = pd.concat([df, tmp])

print(df["Label"].unique())

DROP_COLUMNS = [
    "Flow ID",
    "Source IP",
    "Source Port",
    "Bwd PSH Flags",
    "Bwd URG Flags",
    "Fwd Avg Bytes/Bulk",
    "Fwd Avg Packets/Bulk",
    "Fwd Avg Bulk Rate",
    "Bwd Avg Bytes/Bulk",
]
print(len(df))
df = df.drop(columns=DROP_COLUMNS).replace([np.inf, -np.inf], np.nan)
df = df.dropna(how="any").dropna(how="all", axis=1).drop_duplicates()
print(len(df))

['BENIGN' 'DDoS' 'Web Attack Brute Force' 'Web Attack XSS'
 'Web Attack Sql Injection' 'FTP-Patator' 'SSH-Patator' 'Bot' 'PortScan'
 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed' 'Infiltration']


In [4]:
for value in df["Destination IP"].unique()[:5]:
    ip_int = int(ip.IPv4Address(value))
    print(value, ip_int)

192.168.10.5 3232238085
192.168.10.16 3232238096
192.168.10.8 3232238088
192.168.10.25 3232238105
192.168.10.9 3232238089


In [5]:
DATE_FORMAT = "%m/%d/%Y %H:%M"
for value in df["Timestamp"].unique()[:5]:
    dt = datetime.strptime(value, DATE_FORMAT)
    dt = dt.timestamp()
    print(value, "|", dt)

7/7/2017 3:30 | 1499365800.0
7/7/2017 3:31 | 1499365860.0
7/7/2017 3:32 | 1499365920.0
7/7/2017 3:33 | 1499365980.0
7/7/2017 3:34 | 1499366040.0


In [None]:
# Timestamp -> int
df["continuous_time"] = df["Timestamp"].apply(
    lambda x: datetime.strptime(x, DATE_FORMAT).timestamp()
)
df = df.drop(columns=["Timestamp"])
# rename: continuous_time -> timestamp
df = df.rename(columns={"continuous_time": "Timestamp"})

# Destination IP -> int
df["destination_ip"] = df["Destination IP"].apply(
    lambda x: int(ip.IPv4Address(x))
)
df = df.drop(columns=["Destination IP"])
# rename: destination_ip -> Destination IP
df = df.rename(columns={"destination_ip": "Destination IP"})


print("wgan running?: ", end="")
result = input()

if result != "y":
    counts = df["Label"].value_counts()
    for (label, count) in counts.items():
        label = label.strip().replace(" ", "_")
        df[df["Label"] == label].to_csv(f"{DIR_PATH}/raw_after/cicids2017/label_{label}.csv", index=False)

    print("next: wgan running")
    exit()

In [7]:
def min_max(dataframe):
    for column in dataframe.columns:
        # print(dataframe[column].unique())
        if len(dataframe[column].unique()) <= 1:
            # print(f"Column {column} has only one unique value. Skipping normalization.")
            continue
        dataframe[column] = (
            dataframe[column] - dataframe[column].min()
        ) / (dataframe[column].max() - dataframe[column].min())
    return dataframe

def normalize(dataframe):
    categorical_frame = dataframe[["Destination Port", "Protocol", "Label"]]
    other_frame = dataframe.drop(columns=["Destination Port", "Protocol", "Label"])

    # normalize
    normalized_frame = min_max(other_frame)
    normalized_frame = normalized_frame.replace([np.inf, -np.inf], np.nan)

    tmp = pd.concat(
        [normalized_frame, categorical_frame], axis=1
    )
    return tmp

def under_sampling(dataframe, size):
    CLUSTER_SIZE = 1

    # normalize
    normalized_frame = normalize(dataframe)
    print("normalized_frame", normalized_frame.shape)
    normalized_frame = normalized_frame.replace([np.inf, -np.inf], np.nan)

    continuous_frame = normalized_frame.drop(columns=["Destination Port", "Protocol", "Label"])

    kmeans = KMeans(n_clusters=CLUSTER_SIZE, random_state=0)
    print("k-means: start")
    cluster_labels = kmeans.fit_predict(continuous_frame)
    print("k-means: end")

    i = cluster_labels[0]
    cluster_indices = np.where(cluster_labels == i)[0]
    distances = np.linalg.norm(
        continuous_frame.iloc[cluster_indices] - kmeans.cluster_centers_[i], axis=1
    )
    closest_indices = cluster_indices[np.argsort(distances)[:size]]

    return normalized_frame.iloc[closest_indices]

In [None]:
FEATURE_DIM = 74
LATENT_DIM = 100

class FlowDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.data_frame = dataframe
        
        # self.features = self.data_frame.drop(columns=["Label"])
        self.features = self.features.select_dtypes(include=[np.number])

        self.transform = transform

    def __len__(self):
        return len(self.data_frame)
    
    def __getitem__(self, idx):
        features = self.features.iloc[idx].values.astype(np.float32)
        return torch.tensor(features)


class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(LATENT_DIM, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 1024),
            nn.LeakyReLU(0.2),
            nn.Linear(1024, FEATURE_DIM),
            nn.Sigmoid()
        )
    
    def forward(self, z):
        return self.model(z)


class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(FEATURE_DIM, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1)
        )
    
    def forward(self, x):
        return self.model(x)


generator = Generator()
critic = Critic()
generator_optimizer = optim.RMSprop(generator.parameters(), lr=0.0001)
critic_optimizer = optim.RMSprop(critic.parameters(), lr=0.0001)


def over_sampling(dataframe):
    dataframe = normalize(dataframe)
    size = 20000 - len(dataframe)
    label = dataframe["Label"].unique()[0]
    label = label.replace(" ", "_")
    
    checkpoint = torch.load(f"{DIR_PATH}/wgan/wgan_cicflowmeter_{label}.pth")
    generator.load_state_dict(checkpoint["generator_state_dict"])
    critic.load_state_dict(checkpoint["critic_state_dict"])
    generator_optimizer.load_state_dict(checkpoint["generator_optimizer_state_dict"])
    critic_optimizer.load_state_dict(checkpoint["critic_optimizer_state_dict"])
    generator.eval()

    with torch.no_grad():
        z = torch.randn(size, LATENT_DIM)
        generated_data = generator(z).numpy()

    generated_df = pd.DataFrame(generated_data, columns=dataframe.columns[:-1])

    return generated_df

In [None]:
MAX_SAMPLE_SIZE = 20000

counts = df["Label"].value_counts()
new_df = pd.DataFrame()
for label, count in counts.items():
    print(label, count)
    if count > MAX_SAMPLE_SIZE:
        # under sampling
        tmp = under_sampling(df[df["Label"] == label], MAX_SAMPLE_SIZE)
        print(tmp.shape)
    else:
        # over sampling
        tmp = over_sampling(df[df["Label"] == label])
        print(tmp.shape)
    print("--" * 20)
    new_df = pd.concat([new_df, tmp])

print("==" * 20)
print(new_df["Label"].value_counts())

BENIGN 2254674
normalized_frame (2254674, 75)
k-means: start
k-means: end
(20000, 75)
----------------------------------------
DoS Hulk 178179
normalized_frame (178179, 75)
k-means: start
k-means: end
(20000, 75)
----------------------------------------
DDoS 128023
normalized_frame (128023, 75)
k-means: start
k-means: end
(20000, 75)
----------------------------------------
PortScan 119922
normalized_frame (119922, 75)
k-means: start
k-means: end
(20000, 75)
----------------------------------------
DoS GoldenEye 10286
(10286, 75)
----------------------------------------
FTP-Patator 6878
(6878, 75)
----------------------------------------
DoS slowloris 5692
(5692, 75)
----------------------------------------
DoS Slowhttptest 5263
(5263, 75)
----------------------------------------
SSH-Patator 5098
(5098, 75)
----------------------------------------
Bot 1954
(1954, 75)
----------------------------------------
Web Attack Brute Force 1507
(1507, 75)
----------------------------------------

In [None]:
print(new_df["Label"].value_counts())

counts = new_df["Label"].value_counts()
# save to csv (separate by label)
for label, count in counts.items():
    print(label, count)
    tmp = new_df[new_df["Label"] == label]
    print(tmp.shape)

    label = label.replace(" ", "_")
    # save to csv
    tmp.to_csv(f"{DIR_PATH}/train/{label}.csv")

Label
BENIGN                      20000
DoS Hulk                    20000
DDoS                        20000
PortScan                    20000
DoS GoldenEye               10286
FTP-Patator                  6878
DoS slowloris                5692
DoS Slowhttptest             5263
SSH-Patator                  5098
Bot                          1954
Web Attack Brute Force       1507
Web Attack XSS                652
Infiltration                   36
Web Attack Sql Injection       21
Heartbleed                     11
Name: count, dtype: int64
BENIGN 20000
(20000, 75)
DoS Hulk 20000
(20000, 75)
DDoS 20000
(20000, 75)
PortScan 20000
(20000, 75)
DoS GoldenEye 10286
(10286, 75)
FTP-Patator 6878
(6878, 75)
DoS slowloris 5692
(5692, 75)
DoS Slowhttptest 5263
(5263, 75)
SSH-Patator 5098
(5098, 75)
Bot 1954
(1954, 75)
Web Attack Brute Force 1507
(1507, 75)
Web Attack XSS 652
(652, 75)
Infiltration 36
(36, 75)
Web Attack Sql Injection 21
(21, 75)
Heartbleed 11
(11, 75)


In [None]:
_, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)
test.to_csv(f"{DIR_PATH}/test/test.csv")