# Classification
Classify malware samples according to their labels using their reduced representations.

In [1]:
import tensorflow as tf
# Sanity check: an empty device list is falsy, so "not " is inserted only when
# TensorFlow cannot see any GPU.
print(f"GPU is {'' if tf.config.list_physical_devices('GPU') else 'not '}available")

2024-04-30 16:30:09.424781: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU is available


2024-04-30 16:30:10.705809: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 16:30:10.734667: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 16:30:10.734711: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [2]:
import keras

# Data Processing

Run the `1-Data-Exploration.ipynb` notebook before this one: it generates the `processed-data.csv` file that is loaded below.

In [3]:
import pandas as pd

In [4]:
# Load the processed samples (label, hash and the `dim-XX` feature columns).
# The path is relative to the notebook's working directory.
raw_df = pd.read_csv("../data/processed-data.csv")
# Bare last expression: display the frame (pandas truncates long output).
raw_df

Unnamed: 0,label,hash,dim-00,dim-01,dim-02,dim-03,dim-04,dim-05,dim-06,dim-07,...,dim-22,dim-23,dim-24,dim-25,dim-26,dim-27,dim-28,dim-29,dim-30,dim-31
0,TRICKBOT,f16631469eb35406ef4049d30c763cadda571b25bbdb45...,10.476187,0.187760,7.465795,15.308466,0.0,0.00000,0.0,0.573004,...,8.533947,8.327472,8.284050,19.109735,0.0,20.256151,3.200070,0.0,2.176448,4.019719
1,DARKKOMET,d31a7102cbc54447c251ba62760eb484fd0c9fbb8ea54f...,0.000000,0.000000,0.000000,5.645844,0.0,0.00000,0.0,3.855506,...,1.853672,28.783340,4.066120,0.000000,0.0,3.416591,0.000000,0.0,0.000000,8.837362
2,COBALTSTRIKE,dab956e9c864a84d12e8106a24ac3cf2950394152c62b6...,0.535124,8.481697,0.000000,7.358044,0.0,0.00000,0.0,1.597573,...,0.000000,1.861931,0.000000,5.586996,0.0,0.000000,9.478435,0.0,2.326215,0.000000
3,HIVE,122e397dc3a55143bd276d6ff3bc04a05601fbf390aa52...,10.575252,4.092888,0.000000,9.892773,0.0,3.58558,0.0,0.000000,...,0.000000,0.000000,0.000000,6.631641,0.0,6.494191,16.331840,0.0,8.269161,4.189735
4,REMCOS,30a9e1ca1e35bc557d6b46109822cb6d0a0cf970fb614e...,12.028273,17.616888,12.401561,6.552461,0.0,0.00000,0.0,11.003377,...,0.000000,17.086802,14.339774,24.244026,0.0,13.088219,20.145071,0.0,0.000000,23.489222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,NECONYD,0eee965f286f057a3175797590795bbf99fda65dc8d845...,19.595726,7.555195,14.534891,2.550215,0.0,0.00000,0.0,1.831258,...,0.000000,0.000000,5.556197,8.862435,0.0,7.742065,19.124758,0.0,5.133433,9.574662
1316,DRIDEX,95deb6a8dd79ab42df904e0383ddc96cd30ce9f5da4ba8...,10.834826,3.267542,4.152453,5.834544,0.0,0.00000,0.0,0.000000,...,0.000000,0.000000,7.729246,16.249401,0.0,20.675530,12.258331,0.0,4.397899,1.508195
1317,TRICKBOT,0c6aa0ae05d5fa8bf5a8ea95310be73ee60e55a0ce6864...,0.743422,12.834044,1.282330,0.102441,0.0,0.00000,0.0,6.962776,...,3.398914,0.000000,10.314069,1.099106,0.0,0.000000,6.344897,0.0,5.614013,14.304587
1318,MANSABO,78514a632682d1c07ee4f782302bb6a74f2676f1a91b56...,5.559581,0.000000,0.000000,4.052928,0.0,0.00000,0.0,0.000000,...,0.000000,23.643633,0.000000,0.000000,0.0,21.555307,16.472605,0.0,0.000000,0.000000


For training the model, we don't need the `hash` of the sample.

In [5]:
# `drop` returns a new frame, so `raw_df` keeps the `hash` column and this cell
# can be re-run safely.
df = raw_df.drop(columns=["hash"])
df

Unnamed: 0,label,dim-00,dim-01,dim-02,dim-03,dim-04,dim-05,dim-06,dim-07,dim-08,...,dim-22,dim-23,dim-24,dim-25,dim-26,dim-27,dim-28,dim-29,dim-30,dim-31
0,TRICKBOT,10.476187,0.187760,7.465795,15.308466,0.0,0.00000,0.0,0.573004,1.910154,...,8.533947,8.327472,8.284050,19.109735,0.0,20.256151,3.200070,0.0,2.176448,4.019719
1,DARKKOMET,0.000000,0.000000,0.000000,5.645844,0.0,0.00000,0.0,3.855506,8.777343,...,1.853672,28.783340,4.066120,0.000000,0.0,3.416591,0.000000,0.0,0.000000,8.837362
2,COBALTSTRIKE,0.535124,8.481697,0.000000,7.358044,0.0,0.00000,0.0,1.597573,10.726227,...,0.000000,1.861931,0.000000,5.586996,0.0,0.000000,9.478435,0.0,2.326215,0.000000
3,HIVE,10.575252,4.092888,0.000000,9.892773,0.0,3.58558,0.0,0.000000,7.928631,...,0.000000,0.000000,0.000000,6.631641,0.0,6.494191,16.331840,0.0,8.269161,4.189735
4,REMCOS,12.028273,17.616888,12.401561,6.552461,0.0,0.00000,0.0,11.003377,18.732975,...,0.000000,17.086802,14.339774,24.244026,0.0,13.088219,20.145071,0.0,0.000000,23.489222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,NECONYD,19.595726,7.555195,14.534891,2.550215,0.0,0.00000,0.0,1.831258,0.000000,...,0.000000,0.000000,5.556197,8.862435,0.0,7.742065,19.124758,0.0,5.133433,9.574662
1316,DRIDEX,10.834826,3.267542,4.152453,5.834544,0.0,0.00000,0.0,0.000000,6.442443,...,0.000000,0.000000,7.729246,16.249401,0.0,20.675530,12.258331,0.0,4.397899,1.508195
1317,TRICKBOT,0.743422,12.834044,1.282330,0.102441,0.0,0.00000,0.0,6.962776,11.036983,...,3.398914,0.000000,10.314069,1.099106,0.0,0.000000,6.344897,0.0,5.614013,14.304587
1318,MANSABO,5.559581,0.000000,0.000000,4.052928,0.0,0.00000,0.0,0.000000,7.112037,...,0.000000,23.643633,0.000000,0.000000,0.0,21.555307,16.472605,0.0,0.000000,0.000000


Separate into `X` and `y`.

In [6]:
# X: every remaining column except the label (the feature columns).
X = df.drop(columns=["label"])
# y: the label column, still holding the original string labels at this point.
y = df["label"]

In [7]:
# Sanity check: X and y must have the same number of rows.
print(X.shape)
print(y.shape)

(1320, 32)
(1320,)


# Label Encoding and Train-Test Split

The string labels in `y` need to be encoded as integer class indices before training.

In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit on the original string labels in `df` rather than on `y` itself.
# `y` is overwritten with the encoded integers below, so fitting on `y` would
# make this cell non-idempotent: a second run would fit the encoder on integers,
# replacing `label_encoder.classes_` with integers and corrupting the encoder
# that is pickled afterwards.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["label"])

We want to save the label encoder for use in the full model.

In [9]:
import os
import pickle

label_encoder_path = "../models/classifier/label-encoder.pkl"

# Create the output directory if needed; opening a file for writing inside a
# missing directory raises FileNotFoundError (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)

# Persist the fitted encoder so integer predictions can be mapped back to the
# original label strings later.
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)

80% of the data will be used for training, while the remaining 20% will be held out for testing.

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Sanity check: row counts of the train/test features and labels must match.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1056, 32)
(264, 32)
(1056,)
(264,)


# Model Creation

Nothing too special here, we just use several dense layers for classification.

In [12]:
from keras import layers
from keras.models import Sequential

In [13]:
# Epoch index passed to `model.fit(initial_epoch=...)`; 0 means train from scratch.
INITIAL_EPOCH = 0

def create_model(input_dim=None, num_classes=None):
    """Build and compile the dense classifier.

    Args:
        input_dim: Number of input features per sample. When omitted, the
            column count of the notebook-level feature frame ``X`` is used.
        num_classes: Number of output classes. When omitted, the number of
            classes known to the notebook-level ``label_encoder`` is used.

    Returns:
        A compiled ``Sequential`` model named ``"Classifier"`` with three
        hidden ReLU layers (64, 256, 64 units) and a softmax output layer.
    """
    # Fall back to the notebook globals so existing `create_model()` calls
    # behave exactly as before.
    if input_dim is None:
        input_dim = X.shape[1]
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    model = Sequential(name="Classifier")

    model.add(keras.Input((input_dim,), name="input"))
    model.add(layers.Dense(64, activation="relu"))
    model.add(layers.Dense(256, activation="relu"))
    model.add(layers.Dense(64, activation="relu"))
    # One softmax unit per class.
    model.add(layers.Dense(num_classes, activation="softmax"))

    # The "sparse" loss/metric variants take integer class indices (as produced
    # by the label encoder) instead of one-hot vectors.
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["sparse_categorical_accuracy"]
    )
    return model

In [14]:
# Build a fresh, untrained model and print its layer/parameter overview.
model = create_model()
model.summary()

2024-04-30 16:30:11.024049: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 16:30:11.024129: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 16:30:11.024158: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 16:30:11.975290: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 16:30:11.975423: I external/local_xla/xla/stream_executor

Define callbacks.

In [15]:
import os

checkpoint_path = "../models/classifier/checkpoint.keras"
checkpoint_dir = os.path.dirname(checkpoint_path)

# `checkpoint_dir` was previously computed but never used. Create the directory
# explicitly so the checkpoint location exists before training starts.
os.makedirs(checkpoint_dir, exist_ok=True)

# Overwrite the checkpoint file only when the validation loss improves, so the
# file always holds the best model seen so far.
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    save_best_only=True,
    verbose=1
)

In [16]:
# Stop training once the validation loss has failed to improve by at least
# `min_delta` for `patience` consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=10,
    verbose=1,
)

Load latest checkpoint if there is one.

In [17]:
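# To resume an interrupted run, uncomment the two lines below and set
# INITIAL_EPOCH to the number of epochs that were already completed.
# INITIAL_EPOCH = 54
# model = keras.models.load_model(checkpoint_path)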

Train the model.

In [18]:
# Upper bound on the epoch count; early stopping may end training sooner.
NUM_EPOCHS = 100

# A fifth of the training split is set aside as validation data, which supplies
# the `val_loss` monitored by both callbacks.
model.fit(
    x=X_train,
    y=y_train,
    epochs=NUM_EPOCHS,
    initial_epoch=INITIAL_EPOCH,
    validation_split=0.2,
    callbacks=[checkpointer, early_stopping],
)

Epoch 1/100


I0000 00:00:1714465812.791008  734378 service.cc:145] XLA service 0x7f39d4004bb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1714465812.791046  734378 service.cc:153]   StreamExecutor device (0): Quadro P1000, Compute Capability 6.1
2024-04-30 16:30:12.809499: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-30 16:30:13.607188: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:08[0m 3s/step - loss: 5.6841 - sparse_categorical_accuracy: 0.0000e+00

I0000 00:00:1714465814.817331  734378 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 4.5610 - sparse_categorical_accuracy: 0.0639    
Epoch 1: val_loss improved from inf to 3.45991, saving model to ../models/classifier/checkpoint.keras
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 76ms/step - loss: 4.5440 - sparse_categorical_accuracy: 0.0669 - val_loss: 3.4599 - val_sparse_categorical_accuracy: 0.2783
Epoch 2/100
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 14ms/step - loss: 3.3665 - sparse_categorical_accuracy: 0.2500
Epoch 2: val_loss improved from 3.45991 to 2.91330, saving model to ../models/classifier/checkpoint.keras
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.1235 - sparse_categorical_accuracy: 0.2935 - val_loss: 2.9133 - val_sparse_categorical_accuracy: 0.3679
Epoch 3/100
[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 14ms/step - loss: 2.4214 - sparse_categorical_accuracy: 0.4375
Epoch 3: val_loss improved

<keras.src.callbacks.history.History at 0x7f3a5806e400>

Load the best performing model.

In [19]:
# The checkpoint callback was configured with `save_best_only=True` on
# `val_loss`, so this replaces the in-memory (last-epoch) model with the
# checkpointed one that had the lowest validation loss.
model = keras.models.load_model(checkpoint_path)

In [20]:
# Score the restored model on the held-out test split; the result unpacks into
# the loss and the single metric the model was compiled with.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=1)

print(f"Testing loss:      {test_loss:5.5f}")
print(f"Testing accuracy:  {test_acc:5.5f}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 1.7348 - sparse_categorical_accuracy: 0.6275
Testing loss:      1.90016
Testing accuracy:  0.60606


In [21]:
# Save the evaluated model under its final file name, separate from the
# training checkpoint file.
model.save("../models/classifier/classifier.keras")