In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap
from explainerdashboard import ExplainerDashboard, ClassifierExplainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import captum.attr as c
import time
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import os
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import NearMiss

In [3]:
# Load the cleaned dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv('Cleaned_full_data.csv')



In [4]:
# Discard the retained index in favour of a fresh positional one, then
# normalise the two columns that need cleaning in a single chained step.
data = data.reset_index(drop=True).assign(
    # Missing FTP command counts are treated as zero.
    ct_ftp_cmd=lambda frame: frame['ct_ftp_cmd'].fillna(0),
    # Strip every whitespace run from the category names, then fold the
    # plural 'Backdoors' spelling into 'Backdoor'.
    attack_cat=lambda frame: (
        frame['attack_cat']
        .str.replace(r'\s+', '', regex=True)
        .str.replace('Backdoors', 'Backdoor')
    ),
)

In [5]:
# Keep the binary flags and the target columns aside so the scaler never
# touches them; they are stored in `temp` for re-attachment later.
temp = data.loc[:, ['is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat']]
# Remove the raw categorical/address columns together with the columns that
# were just set aside, leaving only the columns to be scaled.
data = data.drop(columns=[
    'proto', 'dsport', 'service', 'state', 'srcip', 'sport', 'dstip',
    'is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat',
])

In [6]:
# Load the pre-computed encodings used by the correlation encoder. The encoded
# values are based on all threats vs Normal labels.
# Note: 'Full_scrip_encoded.csv' is spelled that way on purpose -- the original
# author flagged the file name itself as carrying a spelling error.
ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7 = [
    pd.read_csv(encoded_path)
    for encoded_path in (
        'Full_proto_encoded.csv',
        'Full_dsport_encoded.csv',
        'Full_service_encoded.csv',
        'Full_state_encoded.csv',
        'Full_scrip_encoded.csv',
        'Full_sport_encoded.csv',
        'Full_dstip_encoded.csv',
    )
]

In [7]:
# MinMax separates Normal data well and reduces noise. Please see Kmeans TSNE
# evaluation in Tools.
# NOTE(review): the scaler is fitted on the complete frame here, before the
# train/test split in the following cell, so held-out rows contribute to the
# min/max statistics -- confirm this is intended.
scaler = MinMaxScaler().fit(data)
scaled_data = scaler.transform(data)
# Rebuild a labelled frame from the scaled array and re-attach, column-wise,
# the columns that were kept out of scaling plus the pre-encoded frames.
data = pd.concat(
    [
        pd.DataFrame(scaled_data, columns=data.columns),
        temp,
        ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7,
    ],
    axis=1,
)

In [8]:
# Hold out 20% of the rows with a fixed seed; the binary 'label' column is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['label'],
    test_size=0.2,
    random_state=42,
)
# Show the class balance of the held-out rows.
print(X_test['label'].value_counts())
# Keep the attack category of every test row for later evaluation indexing,
# since it is removed from the feature frames below.
test_attack_cat = X_test['attack_cat']
# Strip both target columns from the feature frames before training.
X_train, X_test = (
    split.drop(columns=['attack_cat', 'label']) for split in (X_train, X_test)
)

label
0    443831
1     64179
Name: count, dtype: int64


In [15]:
# Prepare batches.
batch_size = 1024
# Convert to tensors. BCE requires float type.
# Every frame/series is converted through `.values` so torch receives a plain
# NumPy array. Passing the pandas Series itself makes torch treat it as a
# generic Python sequence and index it by label; after a shuffled split the
# labels are not 0..n-1, so that path can raise KeyError or be very slow.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
# Pair features with targets so each batch yields (features, labels).
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Add to Dataloader to manage batch processing.
# Training batches are reshuffled on each pass; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
# Number of batches in the training DataLoader.
print(len(train_loader))

1985


In [18]:
from torch.utils.data import DataLoader, TensorDataset
from imblearn.over_sampling import SMOTE
import torch
# Oversample the minority class independently inside every training batch and
# collect the resampled features/labels as lists of tensors.
train_list = []
label_list = []
# Upper bound on the number of neighbours SMOTE may use within one batch.
max_smote_neighbors = 2
for train_seq, train_label in train_loader:
    # Size of the smallest class present in this batch. SMOTE searches for
    # k_neighbors + 1 points among the minority samples, so k_neighbors has to
    # stay strictly below that count (not below the overall batch length).
    _, class_counts = torch.unique(train_label, return_counts=True)
    minority_count = int(class_counts.min())
    smote_neighbors = min(max_smote_neighbors, minority_count - 1)

    if smote_neighbors >= 1:
        try:
            smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
            train_smote, train_label_smote = smote.fit_resample(train_seq.numpy(), train_label.numpy())
            train_smote = torch.tensor(train_smote, dtype=torch.float32)
            train_label_smote = torch.tensor(train_label_smote, dtype=torch.float32)
        except ValueError as e:
            # Best effort: report the problem and keep the batch unchanged.
            print(f"Error running SMOTE: {e}")
            train_smote, train_label_smote = train_seq, train_label
    else:
        # Too few minority samples to interpolate between; keep the batch as is.
        train_smote, train_label_smote = train_seq, train_label
    train_list.append(train_smote)
    label_list.append(train_label_smote)

In [19]:
# Number of collected feature tensors; the resampling loop appends one per batch.
print(len(train_list))

1985


In [22]:
# Concatenate the per-batch feature tensors along the first dimension.
# NOTE(review): label_list is not concatenated in this cell -- confirm the
# matching labels are combined in the same order elsewhere.
train_full = torch.cat(train_list, dim=0)

In [24]:
# Size of the concatenated feature tensor along its first dimension.
print(len(train_full))

3549866
