In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print('Using device:', device)

Using device: cuda


In [None]:
class CustomDataset(Dataset):
    """Flow-record CSV turned into ``(features, label)`` tensors for a classifier.

    Processing steps (see ``process_data`` / ``encode_data``):

    1. drop constant, redundant and mostly-empty columns;
    2. map the string label and the categorical columns to integer codes;
    3. mean-fill the remaining numeric gaps;
    4. one-hot encode the categorical columns with a fixed width per column;
    5. min-max scale every other column to ``[-1, 1]``.

    Args:
        file_path: path of the CSV file to read.
        save_path: optional path; when given, the encoded features and the
            label are written there as CSV.

    Attributes:
        data: the cleaned ``DataFrame`` (after dropping / mapping / filling).
        columns: column index of the CSV as read, before anything is dropped.
        X: float tensor ``(n_rows, n_features)``; the one-hot block comes
            first, followed by the scaled continuous columns.
        Y: float tensor ``(n_rows, 1)`` holding the ``LABEL_MAP`` codes.
        categorical_features: names of the one-hot encoded columns, in order.
        continuous_features: names of the scaled columns, in order.
    """

    # Ordered vocabularies: the position of a value is its integer code and the
    # list length is the one-hot width, so the feature layout never depends on
    # which categories happen to occur in a particular CSV.
    CATEGORY_LEVELS = {
        'Cause': ['Status', 'Start', 'Shutdown'],
        'Proto': ['udp', 'tcp', 'icmp', 'sctp', 'lldp', 'llc', 'arp', 'ipv6-icmp'],
        'State': ['REQ', 'INT', 'CON', 'RST', 'FIN', 'ECO', 'ACC', 'URP', 'RSP', 'TST', 'NRS'],
        'sDSb': ['cs0', 'af41', 'ef', 'af11', 'cs7', '4', 'cs6', '39', '52', '54', 'cs4', 'af12', 'empty'],
    }
    LABEL_MAP = {'Malicious': 0, 'Benign': 1}

    def __init__(self, file_path, save_path=None):
        self.data = pd.read_csv(file_path)
        self.columns = self.data.columns
        self.X, self.Y = self.process_data()
        if save_path:
            self.save_data(save_path)

    def save_data(self, save_path):
        """Write the encoded features plus the label column to ``save_path`` as CSV."""
        data = torch.cat((self.X, self.Y), dim=1)
        # Derive the header from the tensor itself instead of assuming 62 features.
        n_features = self.X.shape[1]
        column_names = [f'feature {i + 1}' for i in range(n_features)] + ['label']
        df = pd.DataFrame(data.numpy(), columns=column_names)
        df.to_csv(save_path, index=False)

    def _map_values(self, column, mapping):
        """Return ``self.data[column]`` translated through ``mapping``.

        Raises:
            ValueError: if the column holds a value that ``mapping`` does not
                cover (it would otherwise turn into NaN and corrupt the
                encoding further down).
        """
        original = self.data[column]
        mapped = original.map(mapping)
        unmapped = mapped.isna()
        if unmapped.any():
            unknown = sorted(original[unmapped].astype(str).unique())
            raise ValueError(f"Column {column!r} contains unmapped values: {unknown}")
        return mapped

    def process_data(self):
        """Clean ``self.data`` and return the encoded ``(X, Y)`` tensors."""
        remaining_nan_columns = ['sTos', 'sTtl', 'sHops']
        target_columns = ['Label', 'Attack Type', 'Attack Tool']
        # Columns the rest of the pipeline accesses by name must survive the
        # constant-column filter even if they are constant in this file.
        required = set(self.CATEGORY_LEVELS) | set(remaining_nan_columns) | set(target_columns)

        columns_to_drop = [
            col for col in self.data
            if col not in required and self.data[col].nunique() == 1
        ]
        redundant_columns = ['Unnamed: 0', 'RunTime', 'Mean', 'Sum', 'Min', 'Max']
        too_much_nan_columns = ['dTos', 'dDSb', 'dTtl', 'dHops', 'SrcGap', 'DstGap', 'SrcWin', 'DstWin', 'sVid', 'dVid', 'SrcTCPBase', 'DstTCPBase']
        columns_to_drop.extend(redundant_columns)
        columns_to_drop.extend(too_much_nan_columns)
        # A redundant column may also be constant; list every name only once.
        columns_to_drop = list(dict.fromkeys(columns_to_drop))
        self.data = self.data.drop(columns=columns_to_drop)

        self.data['Label'] = self._map_values('Label', self.LABEL_MAP)

        # Missing sDSb values get their own 'empty' category instead of NaN.
        self.data['sDSb'] = self.data['sDSb'].fillna('empty')
        for column, levels in self.CATEGORY_LEVELS.items():
            codes = {level: code for code, level in enumerate(levels)}
            self.data[column] = self._map_values(column, codes)

        # Assign the filled column back: an in-place fillna on `self.data[col]`
        # works on a temporary under pandas Copy-on-Write and changes nothing.
        for col in remaining_nan_columns:
            self.data[col] = self.data[col].fillna(self.data[col].mean())

        Y = self.data[['Label']]
        X = self.data.drop(columns=target_columns)

        return self.encode_data(X, Y)

    def normalize_data(self, X_continuous):
        """Min-max scale every column to ``[-1, 1]`` and return a float tensor.

        The scaler is fitted on the data passed in, so the scaling is specific
        to this dataset instance.
        """
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_continuous_scaled = scaler.fit_transform(X_continuous)  # numpy array
        return torch.as_tensor(X_continuous_scaled, dtype=torch.float32)

    def encode_data(self, X, Y):
        """One-hot encode the categorical columns and scale the remaining ones.

        Args:
            X: feature frame whose categorical columns already hold integer codes.
            Y: single-column label frame.

        Returns:
            ``(X_encoded, Y)``: ``X_encoded`` is the one-hot block followed by
            the scaled continuous columns; ``Y`` is a float tensor ``(n_rows, 1)``.
        """
        self.categorical_features = list(self.CATEGORY_LEVELS)
        X_categorical = X[self.categorical_features]
        X_continuous = X.drop(columns=self.categorical_features)
        self.continuous_features = X_continuous.columns

        X_continuous_scaled = self.normalize_data(X_continuous)

        one_hot_columns = []
        for column in self.categorical_features:
            codes = torch.as_tensor(X_categorical[column].to_numpy(dtype='int64'))
            # Fixed width per column keeps the feature layout identical across files.
            width = len(self.CATEGORY_LEVELS[column])
            one_hot_columns.append(F.one_hot(codes, num_classes=width))
        X_categorical = torch.cat(one_hot_columns, dim=1).float()

        X_encoded = torch.cat((X_categorical, X_continuous_scaled), dim=1)

        Y = torch.as_tensor(Y.to_numpy(), dtype=torch.float32)

        return X_encoded, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

In [None]:
# Also write the encoded table to processed.csv: the following cell copies that
# file to Drive, and on a fresh kernel nothing else creates it.
d = CustomDataset('drive/MyDrive/datasets/Combined.csv', save_path='processed.csv')

In [None]:
# Back up the encoded dataset to Google Drive.
# processed.csv only exists if CustomDataset was constructed with
# save_path='processed.csv'; otherwise this copy has nothing to copy.
!cp processed.csv drive/MyDrive/processed.csv

In [None]:
# Serve the dataset in reshuffled mini-batches of 64 samples.
dataloader = DataLoader(
    dataset=d,
    shuffle=True,
    batch_size=64,
)

In [None]:
class MLP(nn.Module):
    """Fully connected binary classifier that outputs one probability per sample.

    Every hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    reduces to a single unit whose sigmoid is returned, so the output can be
    fed to ``nn.BCELoss`` directly.

    Args:
        input_size: number of input features per sample.
        hidden_sizes: widths of the hidden layers, in order. The default
            reproduces the original 1024-768-512-256-128 stack.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_size, hidden_sizes=(1024, 768, 512, 256, 128), dropout=0.2):
        super().__init__()
        layers = []
        in_features = input_size
        for out_features in hidden_sizes:
            layers += [nn.Linear(in_features, out_features), nn.ReLU(), nn.Dropout(dropout)]
            in_features = out_features
        # Attribute names and layer order are unchanged, so state_dict keys
        # produced by the fixed-size version still match the default stack.
        self.fc1 = nn.Sequential(*layers)
        self.fc2 = nn.Linear(in_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map ``x`` of shape ``(batch, input_size)`` to probabilities ``(batch, 1)``."""
        hidden = self.fc1(x)
        logits = self.fc2(hidden)
        return self.sigmoid(logits)

In [None]:
# Take the feature count from the encoded dataset rather than hard-coding it,
# so the model always matches whatever the encoder produced.
input_size = d.X.shape[1]
epochs = 10

In [None]:
# Build the network and place its parameters on the selected device;
# the trailing expression shows the layer summary as the cell output.
model = MLP(input_size).to(device)
model

MLP(
  (fc1): Sequential(
    (0): Linear(in_features=62, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1024, out_features=768, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=768, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=512, out_features=256, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=256, out_features=128, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.2, inplace=False)
  )
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Place the loss module on the same device as the model (the call returns the
# module, which becomes the cell output).
loss_fn.to(device=device)

BCELoss()

In [None]:
# Train for `epochs` passes over `dataloader`, reporting the per-sample mean
# loss and the accuracy (threshold 0.5) measured during training.
for epoch in range(epochs):
    model.train()  # enable dropout
    running_loss = 0.0  # sum of per-sample losses seen this epoch
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in tqdm(dataloader):
        batch_size = inputs.size(0)

        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        predicted_labels = (outputs >= 0.5).float()

        # `loss` is the batch mean; weight it by the batch size so a shorter
        # final batch does not count as much as a full one.
        running_loss += loss.item() * batch_size
        correct_predictions += (predicted_labels == labels).sum().item()
        total_samples += batch_size

    epoch_loss = running_loss / total_samples
    epoch_accuracy = correct_predictions / total_samples
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

100%|██████████| 18999/18999 [01:01<00:00, 307.36it/s]


Epoch [1/10], Loss: 0.0533, Accuracy: 0.9918


100%|██████████| 18999/18999 [00:57<00:00, 331.52it/s]


Epoch [2/10], Loss: 0.0431, Accuracy: 0.9952


100%|██████████| 18999/18999 [00:58<00:00, 327.29it/s]


Epoch [3/10], Loss: 0.0402, Accuracy: 0.9959


100%|██████████| 18999/18999 [00:57<00:00, 332.52it/s]


Epoch [4/10], Loss: 0.0406, Accuracy: 0.9964


100%|██████████| 18999/18999 [00:57<00:00, 330.10it/s]


Epoch [5/10], Loss: 0.0410, Accuracy: 0.9965


100%|██████████| 18999/18999 [00:57<00:00, 329.22it/s]


Epoch [6/10], Loss: 0.0445, Accuracy: 0.9964


100%|██████████| 18999/18999 [00:58<00:00, 326.41it/s]


Epoch [7/10], Loss: 0.0425, Accuracy: 0.9967


100%|██████████| 18999/18999 [00:57<00:00, 327.67it/s]


Epoch [8/10], Loss: 0.0444, Accuracy: 0.9965


100%|██████████| 18999/18999 [00:56<00:00, 334.71it/s]


Epoch [9/10], Loss: 0.0477, Accuracy: 0.9959


100%|██████████| 18999/18999 [00:56<00:00, 334.72it/s]

Epoch [10/10], Loss: 0.0522, Accuracy: 0.9958





In [None]:
# Score the trained model on the same data it was fitted on: collect the
# thresholded predictions batch by batch, then compute a single F1 value.
true_labels, predictions = [], []

model.eval()  # disable dropout for deterministic outputs
with torch.no_grad():
    for batch_inputs, batch_labels in tqdm(dataloader):
        batch_probs = model(batch_inputs.to(device))
        batch_preds = (batch_probs >= 0.5).float()

        # Labels come out of the DataLoader on the CPU already.
        true_labels.extend(batch_labels.numpy())
        predictions.extend(batch_preds.cpu().numpy())

f1 = f1_score(true_labels, predictions)
print(f"F1 Score on Train Data: {f1:.4f}")

100%|██████████| 18999/18999 [00:23<00:00, 799.92it/s]


F1 Score on Train Data: 0.9962
