In [3]:
import os
from sklearn.model_selection import train_test_split

root_dir = "/workspace/Transformer/Databig"  # Replace with your root folder path
labels = {"Adware": 1, "Bankingware": 2, "Benign": 0, "Smsware": 3, "Riskware": 4}
data = []
target = []

# Extract data and labels
for label_name, label_id in labels.items():
    folder_path = os.path.join(root_dir, label_name)
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            with open(os.path.join(folder_path, file_name), 'r') as file:
                syscall_sequence = file.read().strip().split()  # List of syscalls
                data.append(syscall_sequence)
                target.append(label_id)  # Numerical label

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)


In [4]:
from collections import Counter

# Flatten all sequences to create a vocabulary
all_syscalls = [syscall for sequence in data for syscall in sequence]
syscall_counts = Counter(all_syscalls)
vocab = {syscall: idx + 1 for idx, syscall in enumerate(syscall_counts.keys())}  # Start index from 1


In [5]:
# Convert syscall sequences in train and test data to integer IDs
X_train_tokenized = [[vocab[syscall] for syscall in sequence if syscall in vocab] for sequence in X_train]
X_test_tokenized = [[vocab[syscall] for syscall in sequence if syscall in vocab] for sequence in X_test]


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_seq_length = 1000 # Define a maximum sequence length based on your dataset

# Pad sequences
X_train_padded = pad_sequences(X_train_tokenized, maxlen=max_seq_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_tokenized, maxlen=max_seq_length, padding='post', truncating='post')


2024-11-22 05:40:43.082608: E external/local_xla/xla/stream_executor/plugin_registry.cc:91] Invalid plugin kind specified: FFT
2024-11-22 05:40:43.362363: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-22 05:40:45.159711: E external/local_xla/xla/stream_executor/plugin_registry.cc:91] Invalid plugin kind specified: DNN


In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Add

# Transformer block
def transformer_block(inputs, num_heads, ff_dim, dropout_rate):
    # Multi-head attention
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)(inputs, inputs)
    attention_output = Dropout(dropout_rate)(attention_output)
    attention_output = Add()([inputs, attention_output])
    attention_output = LayerNormalization(epsilon=1e-6)(attention_output)
    
    # Feed-forward network
    ffn_output = Dense(ff_dim, activation="relu")(attention_output)
    ffn_output = Dense(inputs.shape[-1])(ffn_output)
    ffn_output = Dropout(dropout_rate)(ffn_output)
    ffn_output = Add()([attention_output, ffn_output])
    ffn_output = LayerNormalization(epsilon=1e-6)(ffn_output)
    return ffn_output

# Build Transformer model
def build_transformer_model(vocab_size, max_len, embed_dim, num_heads, ff_dim, num_classes, dropout_rate=0.1):
    inputs = Input(shape=(max_len,))  # Define input shape here
    embedding_layer = Embedding(input_dim=vocab_size + 1, output_dim=embed_dim)(inputs)  # Removed `input_length`
    
    # Add transformer blocks
    x = transformer_block(embedding_layer, num_heads=num_heads, ff_dim=ff_dim, dropout_rate=dropout_rate)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(num_classes, activation="softmax")(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    return model

# Parameters for the model
embed_dim = 256
num_heads = 4
ff_dim = 128
num_classes = len(labels)
dropout_rate = 0

# Instantiate and compile the model
model = build_transformer_model(len(vocab), max_seq_length, embed_dim, num_heads, ff_dim, num_classes, dropout_rate)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


2024-11-22 05:40:47.839437: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-22 05:40:47.839506: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-22 05:40:48.069245: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-22 05:40:48.069303: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-22 05:40:48.069339: I external/local_xla/xla/stream_executor/rocm/rocm_executor.

In [8]:
import numpy as np

# Ensure the input data is in NumPy array format
X_train_padded = np.array(X_train_padded)
y_train = np.array(y_train)

# Train the Transformer model
history = model.fit(X_train_padded, y_train, epochs=100, batch_size=16, validation_split=0.1)


Epoch 1/100


I0000 00:00:1732254049.696345    1672 service.cc:145] XLA service 0x7a7eb40157c0 initialized for platform ROCM (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732254049.696370    1672 service.cc:153]   StreamExecutor device (0): Radeon RX 7900 GRE, AMDGPU ISA version: gfx1100
2024-11-22 05:40:49.745341: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m  7/264[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 23ms/step - accuracy: 0.2349 - loss: 2.1285   

I0000 00:00:1732254065.995685    1672 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 56ms/step - accuracy: 0.5075 - loss: 1.2455 - val_accuracy: 0.6389 - val_loss: 0.9814
Epoch 2/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.6709 - loss: 0.9026 - val_accuracy: 0.7244 - val_loss: 0.8267
Epoch 3/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.7367 - loss: 0.7290 - val_accuracy: 0.6859 - val_loss: 0.9871
Epoch 4/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.7557 - loss: 0.6923 - val_accuracy: 0.7350 - val_loss: 0.7145
Epoch 5/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.7738 - loss: 0.6148 - val_accuracy: 0.7821 - val_loss: 0.6882
Epoch 6/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.7675 - loss: 0.6378 - val_accuracy: 0.7778 - val_loss: 0.7415
Epoch 7/100
[1m264/264[0m

In [13]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np

# Assuming X_test_padded is the padded test data and y_test is the true labels
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)  # Get the predicted class labels

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Adware', 'Bankingware', 'Benign', 'Smsware', 'Riskware']))

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


[1m 1/37[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 7ms/step

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Classification Report:
              precision    recall  f1-score   support

      Adware       0.72      0.83      0.77       221
 Bankingware       0.84      0.76      0.80       251
      Benign       0.89      0.84      0.87       254
     Smsware       0.95      0.97      0.96       233
    Riskware       0.89      0.89      0.89       211

    accuracy                           0.86      1170
   macro avg       0.86      0.86      0.86      1170
weighted avg       0.86      0.86      0.86      1170

Overall Accuracy: 0.8564102564102564
Confusion Matrix:
[[184  12   9   4  12]
 [ 37 191  12   3   8]
 [ 25   9 213   4   3]
 [  1   3   1 227   1]
 [  8  13   3   0 187]]
