In [1]:
import pandas as pd
import numpy as np
from flowenv.src.const import Const
from pathlib import Path
from imblearn.over_sampling import SMOTENC

# Project configuration object; later cells read its `normalization_features`
# and `features_labels` attributes.
CONST = Const()
# Balanced CICFlowMeter CSV exports of the DNP3 intrusion detection dataset.
# Both paths start with "./", so they are resolved against the working directory.
TRAIN_DATA_PATH = (
    "./DNP3_Intrusion_Detection_Dataset_Final"
    "/Training_Testing_Balanced_CSV_Files"
    "/CICFlowMeter"
    "/CICFlowMeter_Training_Balanced.csv"
)
TEST_DATA_PATH = (
    "./DNP3_Intrusion_Detection_Dataset_Final"
    "/Training_Testing_Balanced_CSV_Files"
    "/CICFlowMeter"
    "/CICFlowMeter_Testing_Balanced.csv"
)

# TRAIN_DATA_PATH = Path(__file__).resolve().parent.joinpath(TRAIN_DATA_PATH)
# TEST_DATA_PATH = Path(__file__).resolve().parent.joinpath(TEST_DATA_PATH)

def min_max_p(p):
    """Min-max scale ``p`` so its smallest value maps to 0 and its largest to 1.

    Args:
        p: An object exposing ``.min()`` / ``.max()`` and arithmetic, such as a
            ``pandas.Series``.

    Returns:
        ``(p - min) / (max - min)``. When ``p`` is constant (``max == min``)
        the result is all zeros instead of the NaN produced by ``0 / 0``.
        Missing values in ``p`` stay missing.
    """
    min_p = p.min()
    span = p.max() - min_p
    # Guard only the scalar case; a non-scalar span (e.g. DataFrame input)
    # keeps the plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        # Constant input: divide by 1 so every value becomes 0.0 rather than NaN.
        span = 1
    return (p - min_p) / span

In [2]:
# Load both splits with the same cleaning steps in the same order: turn +/-inf
# into NaN first, then drop rows and columns that are entirely empty.
train_data = (
    pd.read_csv(TRAIN_DATA_PATH)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(how="all")
    .dropna(how="all", axis=1)
)
test_data = (
    pd.read_csv(TEST_DATA_PATH)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(how="all")
    .dropna(how="all", axis=1)
)

# Binary target: True for flows labelled "NORMAL", False for every other label.
train_data["Binary Label"] = train_data["Label"] == "NORMAL"
test_data["Binary Label"] = test_data["Label"] == "NORMAL"

for label in CONST.normalization_features:
    # Read the training statistics before the training column is overwritten.
    train_min = train_data[label].min()
    train_span = train_data[label].max() - train_min
    train_data[label] = min_max_p(train_data[label])
    # Scale the test split with the *training* min/max so a given raw value maps
    # to the same scaled value in both splits. Test values outside the training
    # range therefore land outside [0, 1].
    # A zero span is replaced by 1 so the division can never produce +/-inf.
    test_data[label] = (test_data[label] - train_min) / (train_span if train_span != 0 else 1)

# Discard every row that still has a missing value.
train_data = train_data.dropna(how="any")
test_data = test_data.dropna(how="any")

In [4]:
# Keep only the configured feature columns plus the binary target.
train_data = train_data.filter(
    items=[*CONST.features_labels, "Binary Label"],
)

In [5]:
# Class balance of the binary target before resampling.
label_counts = train_data["Binary Label"].value_counts()
print(label_counts)

Binary Label
False    4659
True      466
Name: count, dtype: int64


In [6]:
# Columns SMOTENC must treat as categorical rather than continuous.
# (The variable keeps its original spelling in case other cells reference it.)
categorycal_features = ["Dst Port", "Protocol"]

# Split features and target first so the categorical indices below are computed
# against the exact frame that is handed to SMOTENC.
X_train = train_data.drop(columns=["Binary Label"])
y_train = train_data["Binary Label"]

smote = SMOTENC(
    # Positional indices within X_train. Taking them from train_data (which
    # still holds the target column) is only correct while the target is last.
    categorical_features=[X_train.columns.get_loc(label) for label in categorycal_features],
    random_state=42,
)

# X_train / y_train are rebound to the resampled data.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [7]:
# Wrap the resampled features and target as labelled pandas objects, then join
# them side by side into a single training frame.
resampled_parts = [
    pd.DataFrame(X_train, columns=CONST.features_labels),
    pd.Series(y_train, name="Binary Label"),
]
X_resampled, y_resampled = resampled_parts

resampled_train_data = pd.concat(resampled_parts, axis="columns")
