# Data Encoder
Uses an autoencoder to reduce the dimensionality of the data.

In [1]:
import tensorflow as tf
# Report whether TensorFlow can see at least one GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU is {'' if gpu_devices else 'not '}available")

GPU is not available


In [2]:
import keras

# Data Processing

The data should already be present as `dataset.csv` and `top_unigrams.txt` in the `data` folder.

If they are not present, do the following.
1. Ensure that the VirusTotal reports are present in `data/json` with the format `[LABEL]_[HASH].json`.
2. Run `prepare_data.py`. This will generate the two files needed for this notebook.

In [3]:
import pandas as pd

In [4]:
# Location of the prepared dataset, relative to the working directory.
DATASET_PATH = "../data/dataset.csv"

raw_df = pd.read_csv(DATASET_PATH)
raw_df

Unnamed: 0,label,hash,dim-0000,dim-0001,dim-0002,dim-0003,dim-0004,dim-0005,dim-0006,dim-0007,...,dim-9990,dim-9991,dim-9992,dim-9993,dim-9994,dim-9995,dim-9996,dim-9997,dim-9998,dim-9999
0,AAUTO,c35885d8463d1fe937cf5afb628e5f37ac2c33004b90da...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AENJARIS,6eba466355df18050554910e3aece28ac7118d6f9683a2...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AENJARIS,a463230d154886983071433608b97630644aeb46fd2a6e...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AGENTB,0022508fd02bb23c3a2c4f5de0906df506a2fcabc3e841...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AGENTB,08174ddc79fd17dee63232b6aa50c79b96d512546aead8...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,ZAPCHAST,04d6ff264286ef70cdc08e69cebc09a6cfd2752e5ba3a1...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
981,ZAPCHAST,a368fa01248ecb84c56c87fe65edde7f7d3730e1a37e87...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
982,ZAPCHAST,dfce96433887553201a295b1475373a82ac6a730f18d88...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
983,ZEGOST,1367ecca54ac27ce18179d6bfcc0ff93bb7cfb2882dc60...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


For the training of the model, we don't need the label or the file hash.

In [5]:
# Identifier columns are not model inputs; they are re-attached after encoding.
NON_FEATURE_COLUMNS = ["label", "hash"]

df = raw_df.drop(columns=NON_FEATURE_COLUMNS)
df

Unnamed: 0,dim-0000,dim-0001,dim-0002,dim-0003,dim-0004,dim-0005,dim-0006,dim-0007,dim-0008,dim-0009,...,dim-9990,dim-9991,dim-9992,dim-9993,dim-9994,dim-9995,dim-9996,dim-9997,dim-9998,dim-9999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
982,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


80% of the dataframe will be saved for training, while 20% will be left for testing.

In [6]:
from sklearn.model_selection import train_test_split
# Hold out a fixed-seed 20% of the rows for testing; the rest is used for training.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test = train_test_split(df, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

In [7]:
# (rows, feature columns) of the training split.
X_train.shape

(788, 10000)

In [8]:
# (rows, feature columns) of the held-out test split.
X_test.shape

(197, 10000)

# Model Creation

We will use an autoencoder to reduce the dimensionality of the data.

In [9]:
from keras import layers
from keras.models import Sequential

In [10]:
# Widths of the encoder's Dense layers, in order. The last entry is the center
# (bottleneck) layer, i.e. the dimensionality of the encoded data. The decoder
# reuses all but the last entry in reverse order.
LAYER_SIZES = [2048, 512, 128, 32]  # The last layer is the center layer

In [11]:
# Passed to fit(initial_epoch=...); 0 means training starts from the first epoch.
INITIAL_EPOCH = 0

def create_encoder():
    """Build the (uncompiled) encoder half of the autoencoder.

    The input width is taken from the module-level feature frame ``df``; one
    ReLU Dense layer is stacked for every entry of ``LAYER_SIZES``, so the
    output width is ``LAYER_SIZES[-1]``.

    Returns:
        A ``Sequential`` model named ``"Encoder"``.
    """
    n_features = df.shape[1]

    encoder_model = Sequential(name="Encoder")
    encoder_model.add(keras.Input(shape=(n_features,), name="encoder-input"))

    for units in LAYER_SIZES:
        encoder_model.add(layers.Dense(units=units, activation="relu"))

    return encoder_model


def create_decoder():
    """Build the (uncompiled) decoder half of the autoencoder.

    The decoder mirrors the encoder: it accepts ``LAYER_SIZES[-1]`` inputs,
    walks back through the remaining layer sizes from narrowest to widest, and
    ends with a Dense layer as wide as the module-level feature frame ``df``.
    Every layer, including the output layer, uses ReLU.

    Returns:
        A ``Sequential`` model named ``"Decoder"``.
    """
    bottleneck_units = LAYER_SIZES[-1]
    n_features = df.shape[1]

    decoder_model = Sequential(name="Decoder")
    decoder_model.add(keras.Input(shape=(bottleneck_units,), name="decoder-input"))

    # Every size except the bottleneck itself, narrowest first.
    for units in reversed(LAYER_SIZES[:-1]):
        decoder_model.add(layers.Dense(units=units, activation="relu"))

    # Output layer restores the original feature width.
    decoder_model.add(layers.Dense(units=n_features, activation="relu"))

    return decoder_model


def create_autoencoder(encoder, decoder):
    """Chain an encoder and a decoder into one compiled autoencoder.

    Args:
        encoder: Model mapping the original features to the encoded space.
        decoder: Model mapping the encoded space back to the original features.

    Returns:
        A ``Sequential`` model named ``"Autoencoder"`` containing ``encoder``
        followed by ``decoder``, compiled with the Adam optimizer, MSE loss and
        MAE as an additional metric.
    """
    n_features = df.shape[1]

    stacked = Sequential(name="Autoencoder")
    stacked.add(keras.Input(shape=(n_features,), name="encoder-input"))
    for part in (encoder, decoder):
        stacked.add(part)

    stacked.compile(optimizer="adam", loss="mse", metrics=["mae"])

    return stacked

In [12]:
# Build both halves separately so the encoder can later be used on its own,
# then wrap them into a single trainable autoencoder.
encoder = create_encoder()
decoder = create_decoder()
autoencoder = create_autoencoder(encoder, decoder)

In [13]:
# expand_nested=True also lists the layers inside the nested Encoder and Decoder models.
autoencoder.summary(expand_nested=True)

Define callbacks.

In [14]:
import os

# One file per saved epoch: the {epoch:04d} placeholder is filled in with the
# zero-padded epoch number, so file names sort in training order.
checkpoint_path = "../models/encoder/checkpoints/{epoch:04d}.keras"
checkpoint_dir = os.path.dirname(checkpoint_path)

# save_best_only=True writes a checkpoint only when val_loss improves, so within
# a single run the highest-numbered file is the best model seen so far.
# NOTE(review): checkpoints left over from earlier runs are not removed here —
# confirm the directory is clean before relying on "highest number == best".
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor="val_loss",
    save_best_only=True,
    verbose=1
)

In [15]:
# Stop training once val_loss has failed to improve by at least min_delta
# for `patience` consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=5,
    verbose=1,
)

Define some utility functions that help identify the best model trained so far.

In [16]:
def find_latest_model(checkpoint_dir, extension=".keras"):
    """Return the file name of the latest checkpoint in ``checkpoint_dir``.

    "Latest" means last in lexicographic order, which matches training order
    for zero-padded epoch file names such as ``0007.keras``. Only regular files
    whose name ends with ``extension`` are considered, so stray entries
    (editor backups, ``.gitkeep``, notes, sub-directories) are never mistaken
    for a model.

    The directory is created if it does not exist yet.

    Args:
        checkpoint_dir: Directory that holds the checkpoint files.
        extension: File-name suffix identifying checkpoint files.

    Returns:
        The bare file name (not the full path) of the latest checkpoint, or
        ``None`` when the directory contains no checkpoint files.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint_names = [
        entry
        for entry in os.listdir(checkpoint_dir)
        if entry.endswith(extension)
        and os.path.isfile(os.path.join(checkpoint_dir, entry))
    ]

    # max() over the names is the same as sorted(...)[-1], without the sort.
    return max(checkpoint_names, default=None)


def load_latest_model(checkpoint_dir):
    """Load the checkpoint that ``find_latest_model`` selects.

    Args:
        checkpoint_dir: Directory that holds the checkpoint files.

    Returns:
        The model loaded from the selected checkpoint, or ``None`` (after
        printing a notice) when no checkpoint is available.
    """
    checkpoint_name = find_latest_model(checkpoint_dir)

    if checkpoint_name is not None:
        print(f"Loading '{checkpoint_name}'")
        full_path = os.path.join(checkpoint_dir, checkpoint_name)
        loaded = keras.models.load_model(full_path)
        print("Done")
        return loaded

    print("No checkpoints found, not loading anything")
    return None

Load latest checkpoint if there is one.

In [17]:
# Optional: resume an earlier run instead of training from scratch.
# Uncomment both lines and set INITIAL_EPOCH to the epoch number of the
# checkpoint being loaded; it is passed to fit() as initial_epoch.
# INITIAL_EPOCH = 17
# autoencoder = load_latest_model(checkpoint_dir)

Train the model.

In [18]:
# Upper bound on the epoch count; the early_stopping callback may end training sooner.
NUM_EPOCHS = 200

# Autoencoder training: the input doubles as the reconstruction target.
# The last 20% of X_train is held out by Keras as validation data for val_loss.
autoencoder.fit(
    x=X_train,
    y=X_train,
    epochs=NUM_EPOCHS,
    initial_epoch=INITIAL_EPOCH,
    validation_split=0.2,
    callbacks=[checkpointer, early_stopping],
)

Epoch 1/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step - loss: 0.0178 - mae: 0.0264
Epoch 1: val_loss improved from inf to 0.01559, saving model to ../models/encoder/checkpoints/0001.keras
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 307ms/step - loss: 0.0178 - mae: 0.0263 - val_loss: 0.0156 - val_mae: 0.0222
Epoch 2/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step - loss: 0.0144 - mae: 0.0210
Epoch 2: val_loss improved from 0.01559 to 0.01364, saving model to ../models/encoder/checkpoints/0002.keras
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 295ms/step - loss: 0.0144 - mae: 0.0209 - val_loss: 0.0136 - val_mae: 0.0192
Epoch 3/200
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - loss: 0.0134 - mae: 0.0193
Epoch 3: val_loss improved from 0.01364 to 0.01249, saving model to ../models/encoder/checkpoints/0003.keras
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x15f305c0250>

Load the best performing model.

In [19]:
# Replace the in-memory model with the checkpoint whose file name sorts last
# in checkpoint_dir.
# NOTE(review): load_latest_model returns None when no checkpoint exists; the
# cells below assume a model was loaded.
autoencoder = load_latest_model(checkpoint_dir)

Loading '0027.keras'
Done


Evaluate the autoencoder on the test data.

In [20]:
# evaluate() yields the loss followed by the compiled metrics; the model was
# compiled with loss="mse" and metrics=["mae"].
test_metrics = autoencoder.evaluate(x=X_test, y=X_test, verbose=1)
test_mse, test_mae = test_metrics

print(f"Testing MSE: {test_mse:5.5f}")
print(f"Testing MAE: {test_mae:5.5f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0080 - mae: 0.0106
Testing MSE: 0.00870
Testing MAE: 0.01147


Get only the encoder part to save.

In [21]:
# Take the nested "Encoder" sub-model from the reloaded autoencoder, so that
# `encoder` refers to the checkpointed model rather than the one built earlier.
encoder = autoencoder.get_layer("Encoder")

In [22]:
# Sanity check: the final layer should have LAYER_SIZES[-1] units.
encoder.summary()

In [23]:
# Persist only the encoder half of the autoencoder.
encoder.save("../models/encoder/encoder.keras")

# Transforming Original Data

In [24]:
# Encode the full feature frame (train and test rows alike).
# NOTE: despite the name, this holds the raw predict() output at this point;
# it is wrapped in a DataFrame in the next cell.
transformed_df = encoder.predict(df)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [25]:
# One column per unit of the bottleneck layer: dim-00, dim-01, ...
encoded_columns = [f"dim-{i:02d}" for i in range(LAYER_SIZES[-1])]
transformed_df = pd.DataFrame(transformed_df, columns=encoded_columns)

Add the labels and hashes back to the dataframe.

In [26]:
# Re-attach the identifying columns in front of the encoded features.
# transformed_df was built from the predictions for df, which has the same rows
# in the same order as raw_df, and both frames use the default 0..n-1 index, so
# the index-aligned insert lines up row for row.
# The membership check keeps the cell safe to re-run: DataFrame.insert raises
# ValueError when the column already exists.
for position, column in enumerate(["label", "hash"]):
    if column not in transformed_df.columns:
        transformed_df.insert(position, column, raw_df[column])

In [27]:
# Preview the encoded dataset before writing it to disk.
transformed_df

Unnamed: 0,label,hash,dim-00,dim-01,dim-02,dim-03,dim-04,dim-05,dim-06,dim-07,...,dim-22,dim-23,dim-24,dim-25,dim-26,dim-27,dim-28,dim-29,dim-30,dim-31
0,AAUTO,c35885d8463d1fe937cf5afb628e5f37ac2c33004b90da...,0.0,0.000000,0.0,0.0,3.143692,0.0,2.765709,0.236933,...,0.0,2.409686,0.0,5.966980,0.000000,3.192779,0.000000,0.0,9.516870,14.959422
1,AENJARIS,6eba466355df18050554910e3aece28ac7118d6f9683a2...,0.0,4.038340,0.0,0.0,6.892432,0.0,9.448946,0.000000,...,0.0,5.842356,0.0,0.000000,0.000000,0.000000,0.000000,0.0,4.997760,21.368963
2,AENJARIS,a463230d154886983071433608b97630644aeb46fd2a6e...,0.0,4.073854,0.0,0.0,6.920461,0.0,9.223214,0.000000,...,0.0,5.814644,0.0,0.000000,0.000000,0.000000,0.000000,0.0,5.189191,21.501703
3,AGENTB,0022508fd02bb23c3a2c4f5de0906df506a2fcabc3e841...,0.0,0.995746,0.0,0.0,9.711827,0.0,4.073470,0.000000,...,0.0,0.000000,0.0,11.456560,0.000000,3.972807,1.380519,0.0,12.752404,8.822714
4,AGENTB,08174ddc79fd17dee63232b6aa50c79b96d512546aead8...,0.0,0.000000,0.0,0.0,0.000000,0.0,3.594600,2.413564,...,0.0,0.423059,0.0,4.166528,0.000000,10.761652,0.000000,0.0,8.941624,4.914284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,ZAPCHAST,04d6ff264286ef70cdc08e69cebc09a6cfd2752e5ba3a1...,0.0,0.000000,0.0,0.0,11.641270,0.0,19.596598,0.000000,...,0.0,2.564873,0.0,0.064231,0.000000,8.608975,0.000000,0.0,10.842281,12.096865
981,ZAPCHAST,a368fa01248ecb84c56c87fe65edde7f7d3730e1a37e87...,0.0,0.445765,0.0,0.0,4.585519,0.0,2.775109,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.548354,1.283706,0.0,12.336817,2.790969
982,ZAPCHAST,dfce96433887553201a295b1475373a82ac6a730f18d88...,0.0,0.000000,0.0,0.0,1.642815,0.0,15.560945,0.000000,...,0.0,9.013543,0.0,0.000000,0.000000,17.212420,0.000000,0.0,0.000000,0.000000
983,ZEGOST,1367ecca54ac27ce18179d6bfcc0ff93bb7cfb2882dc60...,0.0,7.844924,0.0,0.0,3.715586,0.0,2.872483,0.000000,...,0.0,12.256675,0.0,0.000000,3.586718,13.645184,0.000000,0.0,10.985634,2.622552


In [28]:
# index=False keeps the row index out of the CSV, so the file contains only
# label, hash and the encoded dim-XX columns.
transformed_df.to_csv("../data/encoded-data.csv", index=False)