
### The Data

| Column     | Description              |
|------------|--------------------------|
|`processId`|The unique identifier for the process that generated the event - int64 |
|`threadId`|ID for the thread spawning the log - int64|
|`parentProcessId`|Label for the process spawning this log - int64|
|`userId`|ID of user spawning the log|Numerical - int64|
|`mountNamespace`|Mounting restrictions the process log works within - int64|
|`argsNum`|Number of arguments passed to the event - int64|
|`returnValue`|Value returned from the event log (usually 0) - int64|
|`sus_label`|Binary label as suspicous event (1 is suspicious, 0 is not) - int64|

More information on the dataset: [BETH dataset](accreditation.md)

In [1]:
#Â Import required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as functional
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torchmetrics import Accuracy


In [2]:
train_df = pd.read_csv('labelled_train.csv')
test_df = pd.read_csv('labelled_test.csv')
val_df = pd.read_csv('labelled_validation.csv')

# View the first 5 rows of training set
train_df.head()

Unnamed: 0,processId,threadId,parentProcessId,userId,mountNamespace,argsNum,returnValue,sus_label
0,381,7337,1,100,4026532231,5,0,1
1,381,7337,1,100,4026532231,1,0,1
2,381,7337,1,100,4026532231,0,0,1
3,7347,7347,7341,0,4026531840,2,-2,1
4,7347,7347,7341,0,4026531840,4,0,1


In [3]:
# Separate features and labels
y_train = train_df['sus_label']
X_train = train_df.drop(columns=['sus_label'])

y_val = val_df['sus_label']
X_val = val_df.drop(columns=['sus_label'])

y_test = test_df['sus_label']
X_test = test_df.drop(columns=['sus_label'])

# Scaling the features and labeled data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Converting to tensors and creating datasets
X_train_tensor = torch.tensor(X_train).float()
y_train_tensor = torch.tensor(y_train.values).float()

X_val_tensor = torch.tensor(X_val).float()
y_val_tensor = torch.tensor(y_val.values).float()

X_test_tensor = torch.tensor(X_test).float()
y_test_tensor = torch.tensor(y_test.values).float()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

input_features = X_train.shape[1]

model = nn.Sequential(
    nn.Linear(input_features, 8),
    nn.ReLU(),
    nn.Linear(8, 4),
    nn.ReLU(),
    nn.Linear(4, 1),
    nn.Sigmoid()
)

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

trainloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

num_epochs = 10
print("Starting Training...")

# Training loop

for epoch in range(num_epochs):
    for data in trainloader:
        optimizer.zero_grad()

        features, target = data
        features = features.float()
        target = target.float()

        prediction = model(features)
        loss = criterion(prediction.squeeze(), target)
        loss.backward()
        optimizer.step()
print("Training Finished. Starting Evaluation...")

model.eval()
num_correct = 0
total_samples = 0

validationloader = DataLoader(val_dataset, batch_size=128, shuffle=True)

# Evaluation loop

with torch.no_grad():
    for features, labels in validationloader:
        
        features = features.float()
        labels = labels.float()
        
        output = model(features)

        predicted = torch.round(output.squeeze())
        num_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

val_accuracy = num_correct / total_samples
print(val_accuracy)
model.train()

Starting Training...
Training Finished. Starting Evaluation...
0.9958405435869755


Sequential(
  (0): Linear(in_features=7, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
  (5): Sigmoid()
)