# Data Encoder
Uses an autoencoder to reduce the dimensionality of the data.

In [1]:
import tensorflow as tf
# Report whether TensorFlow can see at least one GPU device.
print(f"GPU is {'' if tf.config.list_physical_devices('GPU') else 'not '}available")

2024-05-06 10:04:49.832493: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU is available


2024-05-06 10:04:51.125977: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-06 10:04:51.162574: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-06 10:04:51.162622: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [2]:
import keras

# Data Processing

The data should already be present as `dataset.csv` and `top_unigrams.txt` in the `data` folder.

If they are not present, do the following.
1. Ensure that the VirusTotal reports are present in `data/json` with the format `[LABEL]_[HASH].json`.
2. Run `prepare_data.py`. This will generate the two files needed for this notebook.

In [3]:
import pandas as pd

In [4]:
# Load the prepared dataset. As the preview below shows, it holds a `label`
# column, a `hash` column, and the feature columns `dim-0000` .. `dim-9999`.
raw_df = pd.read_csv("../data/dataset.csv")
# Bare expression so the notebook renders a (truncated) preview of the frame.
raw_df

Unnamed: 0,label,hash,dim-0000,dim-0001,dim-0002,dim-0003,dim-0004,dim-0005,dim-0006,dim-0007,...,dim-9990,dim-9991,dim-9992,dim-9993,dim-9994,dim-9995,dim-9996,dim-9997,dim-9998,dim-9999
0,TRICKBOT,f16631469eb35406ef4049d30c763cadda571b25bbdb45...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DARKKOMET,d31a7102cbc54447c251ba62760eb484fd0c9fbb8ea54f...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,SALITY,e7fc7de574f44a966b198b7625bd6c595cad05bd669619...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ADPOSHEL,fb576aea86528eaa082efbd073a7d4a6d1c2006da9ba49...,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,VEBZENPAK,4519186b8fb2eaa847255087b44f918928c20e97c2fbea...,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4585,MANSABO,78514a632682d1c07ee4f782302bb6a74f2676f1a91b56...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4586,BENIGN,958cceb0f7f7ae76b2527744da7e2305a372aff304d372...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4587,TRICKBOT,46401903e85a5c457490a6934ec4dc61fdf28df83af377...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4588,TRICKBOT,7eca38a5d0098a7ca4baa1faca43b80b5f911b7580273b...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


For the training of the model, we don't need the label or the file hash.

In [5]:
# Keep only the feature columns: `label` and `hash` identify a sample but are
# not inputs to the autoencoder.
df = raw_df.drop(["label", "hash"], axis="columns")
df

Unnamed: 0,dim-0000,dim-0001,dim-0002,dim-0003,dim-0004,dim-0005,dim-0006,dim-0007,dim-0008,dim-0009,...,dim-9990,dim-9991,dim-9992,dim-9993,dim-9994,dim-9995,dim-9996,dim-9997,dim-9998,dim-9999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


80% of the rows are used for training; the remaining 20% are held out for testing.

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split repeatable from run to run.
X_train, X_test = train_test_split(
    df, test_size=0.2, random_state=42
)

In [7]:
# (rows, columns) of the training portion of the split.
X_train.shape

(3672, 10000)

In [8]:
# (rows, columns) of the held-out test portion of the split.
X_test.shape

(918, 10000)

# Model Creation

We will use an autoencoder to reduce the dimensionality of the data.

In [9]:
from keras import layers
from keras.models import Sequential

In [10]:
# Widths of the encoder's Dense layers, in order. The final entry is the
# center (bottleneck) layer, i.e. the number of dimensions of the encoded
# data; the decoder walks the remaining entries in reverse order.
LAYER_SIZES = [512, 256, 128, 32]  # The last layer is the center layer

In [11]:
# Passed to `autoencoder.fit(initial_epoch=...)`. Leave at 0 for a fresh run;
# the "load latest checkpoint" cell shows how to override it when resuming.
INITIAL_EPOCH = 0

def create_encoder(input_dim=None, layer_sizes=None):
    """Build the encoder half of the autoencoder.

    The encoder is a stack of ReLU Dense layers whose widths follow
    ``layer_sizes``; the last width is the bottleneck (encoded) dimension.

    Args:
        input_dim: Number of input features. Defaults to ``df.shape[1]``.
        layer_sizes: Dense layer widths, last entry being the bottleneck.
            Defaults to the notebook-level ``LAYER_SIZES``.

    Returns:
        An uncompiled ``Sequential`` model named ``"Encoder"``.
    """
    if input_dim is None:
        input_dim = df.shape[1]
    if layer_sizes is None:
        layer_sizes = LAYER_SIZES

    model = Sequential(name="Encoder")
    model.add(keras.Input((input_dim,), name="encoder-input"))

    # Hidden layers: every width except the final (bottleneck) one.
    for layer_size in layer_sizes[:-1]:
        model.add(layers.Dense(layer_size, activation="relu"))

    # Bottleneck layer. NOTE: no activity regularizer is applied here, so
    # sparsity of the encoded representation is not explicitly enforced. Pass
    # `activity_regularizer=...` to this Dense layer if a sparse code is wanted.
    model.add(layers.Dense(layer_sizes[-1], activation="relu"))

    return model


def create_decoder(output_dim=None, layer_sizes=None):
    """Build the decoder half of the autoencoder.

    The decoder mirrors the encoder: it takes a vector of the bottleneck
    width and expands it through the hidden widths in reverse order, ending
    in a layer with ``output_dim`` units.

    Args:
        output_dim: Number of reconstructed features. Defaults to
            ``df.shape[1]``.
        layer_sizes: The encoder's Dense layer widths (last entry is the
            bottleneck). Defaults to the notebook-level ``LAYER_SIZES``.

    Returns:
        An uncompiled ``Sequential`` model named ``"Decoder"``.
    """
    if output_dim is None:
        output_dim = df.shape[1]
    if layer_sizes is None:
        layer_sizes = LAYER_SIZES

    model = Sequential(name="Decoder")
    model.add(keras.Input((layer_sizes[-1],), name="decoder-input"))

    # Walk the hidden widths backwards, starting from the second-last entry
    # (the last one is the bottleneck, which is this model's input).
    for layer_size in layer_sizes[-2::-1]:
        model.add(layers.Dense(layer_size, activation="relu"))

    # Output layer restores the original feature count.
    # NOTE(review): the output uses ReLU. If the features are strictly 0/1, as
    # the dataset preview suggests, a sigmoid output may be worth evaluating;
    # left unchanged here so results stay comparable.
    model.add(layers.Dense(output_dim, activation="relu"))

    return model


def create_autoencoder(encoder, decoder, input_dim=None):
    """Chain an encoder and a decoder into a compiled autoencoder.

    Args:
        encoder: Model mapping the input features to the encoded vector.
        decoder: Model mapping the encoded vector back to the input features.
        input_dim: Number of input features. Defaults to ``df.shape[1]``.

    Returns:
        A ``Sequential`` model named ``"Autoencoder"``, compiled with MSE
        loss, the Adam optimizer, and MAE as an additional metric.
    """
    if input_dim is None:
        input_dim = df.shape[1]

    model = Sequential(name="Autoencoder")
    model.add(keras.Input((input_dim,), name="encoder-input"))
    # The sub-models are added as single nested layers, so the encoder can
    # later be retrieved by its name.
    model.add(encoder)
    model.add(decoder)

    model.compile(
        loss="mse",
        optimizer="adam",
        metrics=["mae"]
    )

    return model

In [12]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable between runs, matching the fixed
# random_state already used for the train/test split. (Some GPU kernels may
# still introduce small run-to-run differences.)
keras.utils.set_random_seed(42)

encoder = create_encoder()
decoder = create_decoder()
autoencoder = create_autoencoder(encoder, decoder)

2024-05-06 10:04:55.379451: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-06 10:04:55.379536: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-06 10:04:55.379568: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-06 10:04:56.160487: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-06 10:04:56.160614: I external/local_xla/xla/stream_executor

In [13]:
# Print the layer table, expanding the nested Encoder and Decoder models.
autoencoder.summary(expand_nested=True)

Define callbacks.

In [14]:
import os

checkpoint_path = "../models/encoder/checkpoint.keras"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the output folder up front so that neither the checkpoint callback
# nor the later `encoder.save(...)` into the same folder can fail on a
# missing directory.
os.makedirs(checkpoint_dir, exist_ok=True)

# Save the model only when the validation loss improves, so the file at
# `checkpoint_path` always holds the best model seen so far.
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    save_best_only=True,
    verbose=1
)

In [15]:
# Stop training once val_loss has gone `patience` epochs without improving
# by at least `min_delta`.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=10,
    verbose=1,
)

Load latest checkpoint if there is one.

In [16]:
# Intentionally commented out. To resume an interrupted run, uncomment the two
# lines below and set INITIAL_EPOCH to the epoch to resume from; the model is
# then restored from the checkpoint file instead of starting untrained.
# INITIAL_EPOCH = 54
# autoencoder = keras.models.load_model(checkpoint_path)

Train the model.

In [17]:
# Upper bound on the number of epochs; early stopping may end training sooner.
NUM_EPOCHS = 200

# An autoencoder learns to reconstruct its own input, so X_train is passed as
# both the features and the target. A fraction of X_train is held out for
# validation; its loss (val_loss) is what both callbacks monitor.
autoencoder.fit(
    X_train,
    X_train,
    validation_split=0.2,
    initial_epoch=INITIAL_EPOCH,
    epochs=NUM_EPOCHS,
    callbacks=[checkpointer, early_stopping]
)

2024-05-06 10:04:56.853276: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 234960000 exceeds 10% of free system memory.
2024-05-06 10:04:57.323282: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 234960000 exceeds 10% of free system memory.
2024-05-06 10:04:57.432907: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 234960000 exceeds 10% of free system memory.
2024-05-06 10:04:57.525790: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 234960000 exceeds 10% of free system memory.


Epoch 1/200


I0000 00:00:1714961098.927779 1104625 service.cc:145] XLA service 0x7f946400f8c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1714961098.927903 1104625 service.cc:153]   StreamExecutor device (0): Quadro P1000, Compute Capability 6.1
2024-05-06 10:04:58.963701: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-06 10:04:59.848068: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8902


[1m16/92[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 11ms/step - loss: 0.0179 - mae: 0.0256

I0000 00:00:1714961102.372248 1104625 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 0.0154 - mae: 0.0228
Epoch 1: val_loss improved from inf to 0.01015, saving model to ../models/encoder/checkpoint.keras
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 59ms/step - loss: 0.0154 - mae: 0.0227 - val_loss: 0.0102 - val_mae: 0.0146
Epoch 2/200
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 0.0097 - mae: 0.0138
Epoch 2: val_loss improved from 0.01015 to 0.00909, saving model to ../models/encoder/checkpoint.keras
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0097 - mae: 0.0138 - val_loss: 0.0091 - val_mae: 0.0130
Epoch 3/200
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - loss: 0.0087 - mae: 0.0122
Epoch 3: val_loss improved from 0.00909 to 0.00839, saving model to ../models/encoder/checkpoint.keras
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss:

<keras.src.callbacks.history.History at 0x7f9553e105e0>

Load the best performing model.

In [18]:
# Replace the in-memory model (which holds the weights of the last epoch run)
# with the checkpoint on disk, which was only written when val_loss improved.
autoencoder = keras.models.load_model(checkpoint_path)

Evaluate the autoencoder on the test data.

In [19]:
# Reconstruction error on the held-out rows. The model was compiled with
# loss="mse" and metrics=["mae"], so evaluate() yields those two values in
# that order.
test_mse, test_mae = autoencoder.evaluate(X_test, X_test, verbose=1)
print(f"Testing MSE: {test_mse:5.5f}")
print(f"Testing MAE: {test_mae:5.5f}")

2024-05-06 10:05:57.885964: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 73440000 exceeds 10% of free system memory.


[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - loss: 0.0062 - mae: 0.0080
Testing MSE: 0.00619
Testing MAE: 0.00804


Get only the encoder part to save.

In [20]:
# Re-bind `encoder` to the sub-model inside the reloaded autoencoder. The
# `encoder` object created earlier is not part of the model loaded from the
# checkpoint. "Encoder" is the name assigned in create_encoder().
encoder = autoencoder.get_layer("Encoder")

In [21]:
# Print the layer table of the extracted encoder on its own.
encoder.summary()

In [22]:
# Persist the standalone encoder in the same folder as the training checkpoint.
encoder.save("../models/encoder/encoder.keras")

# Transforming Original Data

In [23]:
# Run every row of `df` (training and test portions alike) through the encoder.
# At this point `transformed_df` is the raw prediction output; the next cell
# wraps it in a DataFrame.
transformed_df = encoder.predict(df)

[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [24]:
# Wrap the encoder output in a DataFrame with one zero-padded `dim-NN` column
# per bottleneck unit.
transformed_df = pd.DataFrame(
    transformed_df,
    columns=[f"dim-{i:02d}" for i in range(LAYER_SIZES[-1])],
)

Add the labels and hashes back to the dataframe.

In [25]:
# Put `label` and `hash` back in front of the encoded columns. Building a new
# frame (rather than inserting in place) keeps this cell safe to re-run:
# `DataFrame.insert` raises ValueError when the column already exists. Any
# `label`/`hash` columns left over from a previous run are dropped first.
# Rows are matched on the index, which both frames share.
transformed_df = pd.concat(
    [
        raw_df[["label", "hash"]],
        transformed_df.drop(columns=["label", "hash"], errors="ignore"),
    ],
    axis=1,
)

In [26]:
# Preview the encoded dataset (label, hash, then the encoded dimensions).
transformed_df

Unnamed: 0,label,hash,dim-00,dim-01,dim-02,dim-03,dim-04,dim-05,dim-06,dim-07,...,dim-22,dim-23,dim-24,dim-25,dim-26,dim-27,dim-28,dim-29,dim-30,dim-31
0,TRICKBOT,f16631469eb35406ef4049d30c763cadda571b25bbdb45...,0.234683,0.000000,1.112554,3.848537,10.626454,0.000000,8.636655,4.115376,...,11.081755,0.000044,0.000000,0.0,0.000000,0.000000,15.831595,0.000000,7.238762,0.0
1,DARKKOMET,d31a7102cbc54447c251ba62760eb484fd0c9fbb8ea54f...,0.000000,0.000000,5.961174,4.955536,7.219052,6.404585,0.000000,0.000000,...,0.000000,3.986464,15.176069,0.0,1.818527,0.000000,19.848658,0.000000,3.754366,0.0
2,SALITY,e7fc7de574f44a966b198b7625bd6c595cad05bd669619...,1.887301,0.000000,3.101377,0.000000,0.000000,0.000000,8.977000,2.093550,...,6.317175,0.000000,0.000000,0.0,0.000000,0.000000,13.234521,0.000000,0.323077,0.0
3,ADPOSHEL,fb576aea86528eaa082efbd073a7d4a6d1c2006da9ba49...,9.481628,0.000000,1.471548,13.774067,3.201977,0.000000,4.538937,2.335279,...,10.188432,0.000000,5.863317,0.0,0.437514,0.000000,4.933706,0.000000,17.694593,0.0
4,VEBZENPAK,4519186b8fb2eaa847255087b44f918928c20e97c2fbea...,0.000000,7.298491,0.000000,0.103305,8.573599,3.445668,8.807044,2.108075,...,6.378553,1.399800,1.832773,0.0,0.000000,0.000000,0.000000,2.220947,5.460258,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4585,MANSABO,78514a632682d1c07ee4f782302bb6a74f2676f1a91b56...,4.159583,0.000000,2.365226,16.249929,8.144193,12.965709,4.145925,0.000000,...,0.000000,0.000000,18.732872,0.0,3.190338,0.000000,8.085227,0.000000,0.000000,0.0
4586,BENIGN,958cceb0f7f7ae76b2527744da7e2305a372aff304d372...,0.505066,0.000000,1.048248,0.290613,0.000000,2.703252,3.075265,5.338703,...,4.729104,0.000000,0.000000,0.0,0.000000,0.171386,0.000000,1.841697,5.468739,0.0
4587,TRICKBOT,46401903e85a5c457490a6934ec4dc61fdf28df83af377...,0.000000,2.437977,2.851583,15.546938,13.855932,18.300507,5.672984,0.000000,...,0.000000,0.000000,16.283621,0.0,0.000000,0.000000,6.143718,0.000000,0.000000,0.0
4588,TRICKBOT,7eca38a5d0098a7ca4baa1faca43b80b5f911b7580273b...,0.000000,0.000000,0.000000,10.569523,6.198376,14.927482,9.734764,0.000000,...,4.672982,2.387461,12.033911,0.0,0.270081,0.000000,0.000000,0.000000,7.245555,0.0


In [27]:
# Write the encoded dataset to disk; index=False omits the positional row index.
transformed_df.to_csv("../data/encoded-data.csv", index=False)