In [None]:
import pandas as pd
import numpy as np
from flowenv.src.const import Const
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTENC
import kagglehub as kh


def min_max_p(p):
    """Rescale ``p`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        p: Array-like exposing ``min()``/``max()`` and elementwise arithmetic
            (for example a pandas Series or a NumPy array).

    Returns:
        ``(p - min) / (max - min)`` in the same container type as ``p``. When
        every value is identical the range is zero; the result is then all
        zeros instead of the NaNs a 0/0 division would produce.
    """
    min_p = p.min()
    max_p = p.max()
    value_range = max_p - min_p
    # A constant input would divide 0 by 0; dividing by 1 instead yields zeros.
    # A NaN range compares unequal to 0 and is therefore passed through as-is.
    divisor = value_range if value_range != 0 else 1
    return (p - min_p) / divisor


# Project constants object; this notebook reads `attack_labels`, `features_labels`
# and `normalization_features` from it.
CONST = Const()
# Label values treated as attacks; any other label is handled as non-attack below.
ATTACK_LABELS = CONST.attack_labels

# Paths of a previously used dataset (DNP3), kept for reference only.
# TRAIN_DATA_PATH = "../../DNP3_Intrusion_Detection_Dataset_Final/Training_Testing_Balanced_CSV_Files/CICFlowMeter/CICFlowMeter_Training_Balanced.csv"
# TEST_DATA_PATH = "../../DNP3_Intrusion_Detection_Dataset_Final/Training_Testing_Balanced_CSV_Files/CICFlowMeter/CICFlowMeter_Testing_Balanced.csv"

# NOTE(review): the train and test paths point at the SAME CSV, so the "test" split
# is built from the training rows — confirm this is intentional.
TRAIN_DATA_PATH = "./cicddos2019/01-12/DrDoS_LDAP.csv"
TEST_DATA_PATH = "./cicddos2019/01-12/DrDoS_LDAP.csv"


# Disabled script-relative resolution of the two paths above
# (presumably because `__file__` is not defined inside a notebook kernel).
# TRAIN_DATA_PATH = Path(__file__).resolve().parent.joinpath(TRAIN_DATA_PATH)
# TEST_DATA_PATH = Path(__file__).resolve().parent.joinpath(TEST_DATA_PATH)

# Columns that `_read_data` one-hot encodes and declares categorical for SMOTENC.
# NOTE(review): a column listing printed further down in this notebook shows
# 'Destination Port' rather than 'Dst Port' — verify the name matches the CSV header.
CATEGORICAL_FEATURES = ["Dst Port", "Protocol"]


In [None]:
# Same cleaning recipe for both splits: map +/-inf to NaN, discard every row that
# holds a NaN, then discard columns that are entirely NaN.
train_data = (
    pd.read_csv(TRAIN_DATA_PATH)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(how="any")
    .dropna(how="all", axis=1)
)
test_data = (
    pd.read_csv(TEST_DATA_PATH)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(how="any")
    .dropna(how="all", axis=1)
)

# Flag each flow: True when its label is one of ATTACK_LABELS, False otherwise.
train_data["Binary Label"] = [label in ATTACK_LABELS for label in train_data["Label"]]
test_data["Binary Label"] = [label in ATTACK_LABELS for label in test_data["Label"]]

In [None]:


def _load_flow_csv(path):
    """Read one flow CSV and remove unusable columns and rows.

    Infinite values become NaN, columns that are entirely NaN are removed, and
    then every row still containing a NaN is removed. Columns are dropped first
    so that a single empty column cannot wipe out every row.
    """
    return (
        pd.read_csv(path)
        .replace([np.inf, -np.inf], np.nan)
        .dropna(how="all", axis=1)
        .dropna(how="any")
    )


def _read_data(binarize=False, balance=False):
    """Load the train/test CSVs and return them encoded and normalised.

    Steps: clean both files, de-duplicate the training rows, build the target
    column, keep only ``CONST.features_labels`` plus the target, one-hot encode
    ``CATEGORICAL_FEATURES`` with an encoder fitted on train and test together
    (so both splits share the same columns), min-max scale every column in
    ``CONST.normalization_features`` per split, and optionally oversample the
    training split with SMOTENC.

    Args:
        binarize: If True the target column is "Binary Label" (True when the
            label is in ``ATTACK_LABELS``). Otherwise it is "Label Index": 0 for
            labels outside ``ATTACK_LABELS``, else the 1-based position of the
            label in ``ATTACK_LABELS``.
        balance: If True the training split is resampled with SMOTENC
            (``random_state=42``, ``k_neighbors=3``).

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames. The test split is
        never resampled.
    """
    train_data = _load_flow_csv(TRAIN_DATA_PATH).drop_duplicates()
    test_data = _load_flow_csv(TEST_DATA_PATH)

    if binarize:
        targets = "Binary Label"
        train_data[targets] = train_data["Label"].isin(ATTACK_LABELS)
        test_data[targets] = test_data["Label"].isin(ATTACK_LABELS)
    else:
        targets = "Label Index"
        # One dictionary lookup per row instead of a list scan; setdefault keeps
        # the first position of a repeated label, matching list.index().
        label_to_index = {}
        for position, name in enumerate(ATTACK_LABELS, start=1):
            label_to_index.setdefault(name, position)
        train_data[targets] = train_data["Label"].map(lambda label: label_to_index.get(label, 0))
        test_data[targets] = test_data["Label"].map(lambda label: label_to_index.get(label, 0))

    kept_columns = CONST.features_labels + [targets]
    train_data = train_data.filter(items=kept_columns)
    test_data = test_data.filter(items=kept_columns)

    # Remember the boundary BEFORE the names are rebound; the split below must
    # hand back exactly the rows that came from each file.
    n_train = len(train_data)

    combined = pd.concat([train_data, test_data], ignore_index=True)
    ohe = OneHotEncoder(sparse_output=False)
    encoded_values = ohe.fit_transform(combined[CATEGORICAL_FEATURES])
    encoded_columns = ohe.get_feature_names_out(CATEGORICAL_FEATURES).tolist()
    encoded = pd.DataFrame(encoded_values, columns=encoded_columns, index=combined.index)
    combined = pd.concat([combined.drop(columns=CATEGORICAL_FEATURES), encoded], axis=1)

    # Independent copies so the column assignments below do not write into
    # slices of `combined`.
    train_data = combined.iloc[:n_train].copy()
    test_data = combined.iloc[n_train:].copy()

    for label in CONST.normalization_features:
        # Scale in float64: casting the scaled values back to an integer dtype
        # would truncate every value in [0, 1) to 0.
        train_data[label] = min_max_p(train_data[label].astype("float64"))
        test_data[label] = min_max_p(test_data[label].astype("float64"))

    train_data = train_data.dropna(how="any")
    test_data = test_data.dropna(how="any")

    if not balance:
        return train_data, test_data

    X_train = train_data.drop(columns=[targets])
    y_train = train_data[targets]
    feature_columns = list(X_train.columns)

    smote = SMOTENC(
        # SMOTENC takes positional indices; every one-hot column is categorical.
        categorical_features=[X_train.columns.get_loc(label) for label in encoded_columns],
        random_state=42,
        k_neighbors=3,
    )
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Re-attach the column names in case the resampler returned plain arrays.
    X_resampled = pd.DataFrame(X_resampled, columns=feature_columns)
    y_resampled = pd.Series(y_resampled, name=targets)
    resampled_train_data = pd.concat([X_resampled, y_resampled], axis=1)

    return resampled_train_data, test_data


def using_nonbalanced_data():
    """Return ``(train, test)`` with the binary target and no oversampling."""
    splits = _read_data(binarize=True, balance=False)
    return splits

def using_data():
    """Return ``(train, test)`` with the binary target and a balanced training split."""
    splits = _read_data(balance=True, binarize=True)
    return splits


def using_multiple_data():
    """Return ``((train, test), index_info)`` for the multi-class target.

    ``index_info`` lists the class names by target index: "Normal" at position 0,
    followed by the entries of ``ATTACK_LABELS``.
    """
    index_info = ["Normal", *ATTACK_LABELS]
    splits = _read_data(binarize=False, balance=True)
    return splits, index_info

def label_info():
    """Return the class names: "Normal" first, then every entry of ``ATTACK_LABELS``."""
    names = ["Normal"]
    names.extend(ATTACK_LABELS)
    return names

In [1]:
import matplotlib
import random

from collections import deque, namedtuple
from itertools import count
from time import time

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils as utils
import torch.optim as optim
import pandas as pd

# True when the active matplotlib backend name contains "inline"; only in that
# case is IPython's display module imported.
_backend_name = matplotlib.get_backend()
is_ipython = 'inline' in _backend_name
if is_ipython:
    from IPython import display

In [None]:
# Accelerator probing is switched off, so the run always uses the CPU.
# Set the flag to True to pick the first available backend in the order below.
_PROBE_ACCELERATORS = False
# torch sub-modules queried via `is_available()`, highest priority first
# ("hip" was left out of the original probe as well).
_ACCELERATOR_BACKENDS = ("cuda", "mps", "mtia", "xpu")

device_name = "cpu"
if _PROBE_ACCELERATORS:
    for _backend in _ACCELERATOR_BACKENDS:
        if getattr(torch, _backend).is_available():
            device_name = _backend
            break

device = torch.device(device_name)
print(f"device: {device_name}")

device: cpu


: 

In [None]:
import os
import sys

# Location of the local "machine-learning" checkout that provides the `flowdata`
# and `flowenv` packages. Set the MACHINE_LEARNING_REPO environment variable to
# run on another machine; the default is the path this notebook was written with.
MACHINE_LEARNING_REPO = os.environ.get(
    "MACHINE_LEARNING_REPO",
    "/Users/toshi_pro/Documents/github-sub/machine-learning",
)
# Re-running the cell must not stack duplicate entries on sys.path.
if MACHINE_LEARNING_REPO not in sys.path:
    sys.path.append(MACHINE_LEARNING_REPO)

import flowdata
import flowenv  # presumably registers the "flowenv/..." environment ids on import — confirm

# Train/test frames as returned by the project's data loader.
raw_data_train, raw_data_test = flowdata.flow_data.using_data()

# Both environments use the same id and differ only in the data they receive
# (earlier revisions used "flowenv/FlowTrain-v0" and "flowenv/FlowTest-v0").
train_env = gym.make("flowenv/Flow-v1", data=raw_data_train)
test_env = gym.make("flowenv/Flow-v1", data=raw_data_test)

  from .autonotebook import tqdm as notebook_tqdm
  train_data = pd.read_csv(TRAIN_DATA_PATH).replace([np.inf, -np.inf], np.nan).dropna(how="any").dropna(how="all", axis=1)
  test_data = pd.read_csv(TEST_DATA_PATH).replace([np.inf, -np.inf], np.nan).dropna(how="any").dropna(how="all", axis=1)


Index(['Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port', 'Destination IP',
       'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Le