# Classification
Classify malware samples according to their labels using their reduced representations.

In [1]:
import tensorflow as tf
# Ask TensorFlow which GPU devices it can see; an empty list means none are usable.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU is {'not ' if len(gpu_devices) == 0 else ''}available")

2024-04-30 14:01:20.994112: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU is available


2024-04-30 14:01:22.340283: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 14:01:22.369546: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 14:01:22.369591: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [2]:
import keras

# Data Processing

The `1-Data-Exploration.ipynb` file should have been run before this, and, as a result, the `processed-data.csv` file should have been generated.

In [3]:
import pandas as pd

In [4]:
# Reduced sample representations produced by the data-exploration notebook.
processed_data_path = "../data/processed-data.csv"
raw_df = pd.read_csv(processed_data_path)
raw_df

Unnamed: 0,label,hash,dim-00,dim-01,dim-02,dim-03,dim-04,dim-05,dim-06,dim-07,...,dim-22,dim-23,dim-24,dim-25,dim-26,dim-27,dim-28,dim-29,dim-30,dim-31
0,TRICKBOT,f16631469eb35406ef4049d30c763cadda571b25bbdb45...,0.0,7.417366,0.000000,13.861341,14.981039,0.0,0.0,0.0,...,6.086682,0.0,23.732056,22.293385,9.553692,19.502905,0.000000,0.000000,0.0,0.0
1,DARKKOMET,d31a7102cbc54447c251ba62760eb484fd0c9fbb8ea54f...,0.0,5.675191,0.000000,23.251524,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.000000,8.913853,0.000000,0.000000,0.000000,3.610444,0.0,0.0
2,COBALTSTRIKE,dab956e9c864a84d12e8106a24ac3cf2950394152c62b6...,0.0,8.435247,11.025072,1.910951,2.756680,0.0,0.0,0.0,...,0.000000,0.0,0.000000,1.928727,3.386528,14.165046,1.996442,0.000000,0.0,0.0
3,HIVE,122e397dc3a55143bd276d6ff3bc04a05601fbf390aa52...,0.0,7.161143,7.466067,0.000000,12.057423,0.0,0.0,0.0,...,2.785359,0.0,6.948788,0.000000,10.238041,7.976179,0.000000,0.685886,0.0,0.0
4,REMCOS,30a9e1ca1e35bc557d6b46109822cb6d0a0cf970fb614e...,0.0,13.437019,6.879735,5.775038,0.000000,0.0,0.0,0.0,...,0.000000,0.0,10.821386,22.139019,0.000000,6.127213,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264,NECONYD,0eee965f286f057a3175797590795bbf99fda65dc8d845...,0.0,6.819577,13.081622,3.350442,21.139126,0.0,0.0,0.0,...,9.952506,0.0,10.925339,2.663227,3.490541,3.282700,0.000000,0.000000,0.0,0.0
1265,DRIDEX,95deb6a8dd79ab42df904e0383ddc96cd30ce9f5da4ba8...,0.0,9.382783,4.581702,2.148845,13.126667,0.0,0.0,0.0,...,4.992890,0.0,22.065388,7.085610,7.298818,15.700541,0.000000,0.000000,0.0,0.0
1266,TRICKBOT,0c6aa0ae05d5fa8bf5a8ea95310be73ee60e55a0ce6864...,0.0,6.495442,8.065195,9.653446,2.007170,0.0,0.0,0.0,...,2.821582,0.0,0.000000,3.746993,0.000000,4.727379,0.000000,2.075920,0.0,0.0
1267,MANSABO,78514a632682d1c07ee4f782302bb6a74f2676f1a91b56...,0.0,12.675972,0.000000,27.582960,0.000000,0.0,0.0,0.0,...,0.948577,0.0,9.169173,17.898558,4.945037,0.000000,0.000000,0.000000,0.0,0.0


For training the model, we don't need the `hash` of the sample.

In [5]:
# The sample hash is not needed for training, so leave it out of the working frame.
df = raw_df.drop("hash", axis="columns")
df

Unnamed: 0,label,dim-00,dim-01,dim-02,dim-03,dim-04,dim-05,dim-06,dim-07,dim-08,...,dim-22,dim-23,dim-24,dim-25,dim-26,dim-27,dim-28,dim-29,dim-30,dim-31
0,TRICKBOT,0.0,7.417366,0.000000,13.861341,14.981039,0.0,0.0,0.0,0.0,...,6.086682,0.0,23.732056,22.293385,9.553692,19.502905,0.000000,0.000000,0.0,0.0
1,DARKKOMET,0.0,5.675191,0.000000,23.251524,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,8.913853,0.000000,0.000000,0.000000,3.610444,0.0,0.0
2,COBALTSTRIKE,0.0,8.435247,11.025072,1.910951,2.756680,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,1.928727,3.386528,14.165046,1.996442,0.000000,0.0,0.0
3,HIVE,0.0,7.161143,7.466067,0.000000,12.057423,0.0,0.0,0.0,0.0,...,2.785359,0.0,6.948788,0.000000,10.238041,7.976179,0.000000,0.685886,0.0,0.0
4,REMCOS,0.0,13.437019,6.879735,5.775038,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,10.821386,22.139019,0.000000,6.127213,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264,NECONYD,0.0,6.819577,13.081622,3.350442,21.139126,0.0,0.0,0.0,0.0,...,9.952506,0.0,10.925339,2.663227,3.490541,3.282700,0.000000,0.000000,0.0,0.0
1265,DRIDEX,0.0,9.382783,4.581702,2.148845,13.126667,0.0,0.0,0.0,0.0,...,4.992890,0.0,22.065388,7.085610,7.298818,15.700541,0.000000,0.000000,0.0,0.0
1266,TRICKBOT,0.0,6.495442,8.065195,9.653446,2.007170,0.0,0.0,0.0,0.0,...,2.821582,0.0,0.000000,3.746993,0.000000,4.727379,0.000000,2.075920,0.0,0.0
1267,MANSABO,0.0,12.675972,0.000000,27.582960,0.000000,0.0,0.0,0.0,0.0,...,0.948577,0.0,9.169173,17.898558,4.945037,0.000000,0.000000,0.000000,0.0,0.0


Separate into `X` and `y`.

In [6]:
# Target: the `label` column. Features: every remaining column.
y = df["label"]
X = df.drop(columns="label")

In [7]:
# Sanity check: features and labels must have the same number of rows.
for part in (X, y):
    print(part.shape)

(1269, 32)
(1269,)


# Label Encoding and Train-Test Split

We need to encode the labels for the `y`.

In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit on the original string labels in `df` rather than on `y` itself: `y` is
# overwritten on the next line with integer codes, so fitting from `y` would,
# on a re-run of this cell, train the encoder on those integers and lose the
# class names (and that broken encoder would then be pickled below).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["label"])

We want to save the label encoder for use in the full model.

In [9]:
import os
import pickle

encoder_path = "../models/classifier/label-encoder.pkl"

# open() does not create missing parent directories, so make sure the output
# folder exists before writing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(encoder_path), exist_ok=True)

with open(encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)

80% of the data will be used for training, while the remaining 20% will be held out for testing.

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Show the size of each partition produced by the split.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

(1015, 32)
(254, 32)
(1015,)
(254,)


# Model Creation

Nothing too special here: we just use several dense layers for classification.

In [12]:
from keras import layers
from keras.models import Sequential

In [13]:
# Epoch index training starts from; raise it only when resuming from a checkpoint.
INITIAL_EPOCH = 0

def create_model(input_dim=None, num_classes=None):
    """Build and compile the dense classifier.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            notebook-level ``X.shape[1]``.
        num_classes: Number of output classes. When omitted, falls back to
            ``len(label_encoder.classes_)`` from the notebook-level encoder.

    Returns:
        A compiled ``Sequential`` model named "Classifier" with three ReLU
        hidden layers (64, 256, 64 units) and a softmax output layer.
    """
    # Only touch the notebook globals when the caller did not supply a size,
    # so the function can also be used outside this notebook's state.
    if input_dim is None:
        input_dim = X.shape[1]
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    model = Sequential(name="Classifier")

    model.add(keras.Input((input_dim,), name="input"))
    model.add(layers.Dense(64, activation="relu"))
    model.add(layers.Dense(256, activation="relu"))
    model.add(layers.Dense(64, activation="relu"))
    model.add(layers.Dense(num_classes, activation="softmax"))

    # Labels are integer-encoded, hence the sparse loss and metric.
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["sparse_categorical_accuracy"]
    )
    return model

In [14]:
# Seed Python, NumPy and the backend before any weights are created so that
# weight initialisation and batch shuffling are repeatable across
# "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
keras.utils.set_random_seed(42)

model = create_model()
model.summary()

2024-04-30 14:01:22.668787: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 14:01:22.668900: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 14:01:22.668931: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 14:01:23.505430: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-30 14:01:23.505578: I external/local_xla/xla/stream_executor

Define callbacks.

In [15]:
import os

checkpoint_path = "../models/classifier/checkpoint.keras"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Write the model to `checkpoint_path` only on epochs where the validation loss improves.
checkpointer = keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
)

In [16]:
# Stop training once val_loss has gone 10 epochs without improving by at least 1e-4.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=10,
    verbose=1,
)

Optionally load the latest saved checkpoint, if there is one, to resume a previous training run (uncomment the cell below).

In [17]:
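# To resume an interrupted run, uncomment the two lines below and set
# INITIAL_EPOCH to the number of epochs that have already been completed.
# INITIAL_EPOCH = 54
# model = keras.models.load_model(checkpoint_path)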

Train the model.

In [18]:
NUM_EPOCHS = 100

# A fifth of the training rows is held out for validation; both callbacks
# monitor the resulting val_loss.
training_options = {
    "validation_split": 0.2,
    "initial_epoch": INITIAL_EPOCH,
    "epochs": NUM_EPOCHS,
    "callbacks": [checkpointer, early_stopping],
}

model.fit(X_train, y_train, **training_options)

Epoch 1/100


I0000 00:00:1714456884.363605  664042 service.cc:145] XLA service 0x7f84e8004380 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1714456884.363640  664042 service.cc:153]   StreamExecutor device (0): Quadro P1000, Compute Capability 6.1
2024-04-30 14:01:24.383771: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-30 14:01:25.178909: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m 1/26[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:07[0m 3s/step - loss: 5.2797 - sparse_categorical_accuracy: 0.0000e+00

I0000 00:00:1714456886.420316  664042 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 4.4855 - sparse_categorical_accuracy: 0.0853    
Epoch 1: val_loss improved from inf to 3.39868, saving model to ../models/classifier/checkpoint.keras
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 79ms/step - loss: 4.4683 - sparse_categorical_accuracy: 0.0885 - val_loss: 3.3987 - val_sparse_categorical_accuracy: 0.2562
Epoch 2/100
[1m 1/26[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 14ms/step - loss: 3.0107 - sparse_categorical_accuracy: 0.3438
Epoch 2: val_loss improved from 3.39868 to 2.71176, saving model to ../models/classifier/checkpoint.keras
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.9909 - sparse_categorical_accuracy: 0.3292 - val_loss: 2.7118 - val_sparse_categorical_accuracy: 0.3892
Epoch 3/100
[1m 1/26[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 14ms/step - loss: 2.4318 - sparse_categorical_accuracy: 0.4688
Epoch 3: val_loss improved

<keras.src.callbacks.history.History at 0x7f85b0092490>

Load the best performing model.

In [19]:
# Replace the in-memory model with the file written by the checkpoint callback,
# which was only saved on epochs where val_loss improved.
model = keras.models.load_model(checkpoint_path)

In [20]:
# evaluate() yields the loss followed by the compiled metric, in that order.
evaluation = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_acc = evaluation

print(f"Testing loss:      {test_loss:5.5f}")
print(f"Testing accuracy:  {test_acc:5.5f}")

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 63ms/step - loss: 1.7363 - sparse_categorical_accuracy: 0.6447
Testing loss:      1.66822
Testing accuracy:  0.66535


In [21]:
# Persist the model that was just evaluated (the one reloaded from the checkpoint) under its final file name.
model.save("../models/classifier/classifier.keras")