In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn

# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Paths to directories containing processed sequences
data_root = '/workspace/sequence2/'
class_folders = ['Adware', 'Benign', 'Bankingware', 'Riskware', 'Smsware']


Using device: cuda


In [4]:
# Label encoding for class names
label_encoder = LabelEncoder()
label_encoder.fit(class_folders)  # Assigns a unique integer to each class

# Define padding index based on the number of unique syscalls
padding_index = len(label_encoder.classes_)

# Limit maximum sequence length to prevent memory overflow
MAX_SEQUENCE_LENGTH = 500  # Adjust as needed based on available memory

class SyscallDataset(Dataset):
    def __init__(self, data_root, class_folders, label_encoder, max_seq_length=MAX_SEQUENCE_LENGTH):
        self.data = []
        self.label_encoder = label_encoder
        self.max_seq_length = max_seq_length

        # Load sequences and assign labels based on folder name
        for folder in class_folders:
            folder_path = os.path.join(data_root, folder)
            label = label_encoder.transform([folder])[0]  # Get the encoded label for the class

            for filename in os.listdir(folder_path):
                if filename.endswith('_sequence.txt'):
                    file_path = os.path.join(folder_path, filename)
                    with open(file_path, 'r') as f:
                        # Load the syscall sequence and convert it to a list of integers
                        sequence = [int(index) for index in f.read().strip().split()]
                        # Truncate the sequence to the maximum length
                        sequence = sequence[:self.max_seq_length]
                        # Ensure all indices are within valid bounds, replacing out-of-bounds indices with padding index
                        sequence = [idx if idx < padding_index else padding_index for idx in sequence]
                        self.data.append((sequence, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence, label = self.data[idx]
        return torch.tensor(sequence, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [5]:
# Initialize dataset and dataloader with smaller batch size
dataset = SyscallDataset(data_root, class_folders, label_encoder)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=lambda x: collate_fn(x))  # Use smaller batch size

# Padding function to ensure batch sequences have uniform length
def collate_fn(batch):
    sequences, labels = zip(*batch)
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=padding_index)
    labels = torch.tensor(labels)
    return padded_sequences.to(device), labels.to(device)

print("DataLoader initialized with padding.")


DataLoader initialized with padding.


In [6]:
class SyscallLSTMClassifier(nn.Module):
    def __init__(self, num_syscalls, embedding_dim, hidden_dim, output_dim):
        super(SyscallLSTMClassifier, self).__init__()
        # Use an embedding layer to represent syscalls as dense vectors, with padding_idx set to padding_index
        self.embedding = nn.Embedding(num_syscalls + 1, embedding_dim, padding_idx=padding_index)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)  # Apply embedding layer to input
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])  # Use the last hidden state for classification
        return output

# Hyperparameters
embedding_dim = 128
hidden_dim = 64
output_dim = len(class_folders)  # Number of classes
num_syscalls = padding_index  # Total number of unique syscalls

# Initialize model, loss function, and optimizer
model = SyscallLSTMClassifier(num_syscalls, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("Model initialized and moved to device.")


Model initialized and moved to device.


In [7]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for sequences, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")

print("Training complete.")


Epoch 1/100, Loss: 1.5873
Epoch 2/100, Loss: 1.5713
Epoch 3/100, Loss: 1.5638
Epoch 4/100, Loss: 1.5591
Epoch 5/100, Loss: 1.5523
Epoch 6/100, Loss: 1.5168
Epoch 7/100, Loss: 1.5152
Epoch 8/100, Loss: 1.4879
Epoch 9/100, Loss: 1.4673
Epoch 10/100, Loss: 1.4335
Epoch 11/100, Loss: 1.4427
Epoch 12/100, Loss: 1.4173
Epoch 13/100, Loss: 1.4034
Epoch 14/100, Loss: 1.4528
Epoch 15/100, Loss: 1.4662
Epoch 16/100, Loss: 1.4109
Epoch 17/100, Loss: 1.3869
Epoch 18/100, Loss: 1.3775
Epoch 19/100, Loss: 1.3678
Epoch 20/100, Loss: 1.3639
Epoch 21/100, Loss: 1.3659
Epoch 22/100, Loss: 1.3523
Epoch 23/100, Loss: 1.3691
Epoch 24/100, Loss: 1.3550
Epoch 25/100, Loss: 1.3486
Epoch 26/100, Loss: 1.3428
Epoch 27/100, Loss: 1.4050
Epoch 28/100, Loss: 1.4253
Epoch 29/100, Loss: 1.3788
Epoch 30/100, Loss: 1.3426
Epoch 31/100, Loss: 1.3390
Epoch 32/100, Loss: 1.3313
Epoch 33/100, Loss: 1.3510
Epoch 34/100, Loss: 1.3296
Epoch 35/100, Loss: 1.3168
Epoch 36/100, Loss: 1.3240
Epoch 37/100, Loss: 1.3671
Epoch 38/1

In [11]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


In [12]:
def evaluate(model, dataloader, label_encoder):
    model.eval()
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for sequences, labels in dataloader:
            outputs = model(sequences)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
    
    # Generate metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    report = classification_report(
        all_labels,
        all_predictions,
        target_names=label_encoder.classes_,
        digits=4
    )
    
    print(f'Overall Accuracy: {accuracy * 100:.2f}%')
    print("Classification Report:\n", report)


In [13]:
# Run evaluation and print per-label metrics
evaluate(model, dataloader, label_encoder)


Overall Accuracy: 24.36%
Classification Report:
               precision    recall  f1-score   support

      Adware     0.1876    0.1081    0.1371      1175
 Bankingware     0.2238    0.4365    0.2959      1205
      Benign     0.7500    0.0280    0.0541      1070
    Riskware     0.4821    0.1526    0.2318      1147
     Smsware     0.2343    0.4593    0.3103      1215

    accuracy                         0.2436      5812
   macro avg     0.3756    0.2369    0.2058      5812
weighted avg     0.3665    0.2436    0.2096      5812

