In [3]:
import numpy as np
from tqdm import tqdm

In [4]:
def embed_bytes(byte):
    binary_string = "{0:08b}".format(byte)
    vec = np.zeros(8)
    for i in range(8):
        if binary_string[i] == "1":
            vec[i] = float(1) / 16
        else:
            vec[i] = -float(1) / 16
    return vec

In [5]:
byte1 = 255
byte2 = 1
print(embed_bytes(byte1))
print(embed_bytes(byte2))

[0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.0625]
[-0.0625 -0.0625 -0.0625 -0.0625 -0.0625 -0.0625 -0.0625  0.0625]


In [7]:
import os
from os import listdir

directories_with_labels = [("Benign PE Samples", 0), ("Malicious PE Samples", 1)]
list_of_samples = []
labels = []
for dataset_path, label in directories_with_labels:
    samples = [f for f in listdir(dataset_path)]
    for file in samples:
        file_path = os.path.join(dataset_path, file)
        list_of_samples.append(file_path)
        labels.append(label)

In [8]:
def read_file(file_path):
    """Read the binary sequence of a file."""
    with open(file_path, "rb") as binary_file:
        return binary_file.read()

In [9]:
max_size = 15000
num_samples = len(list_of_samples)
X = np.zeros((num_samples, 8, max_size))
Y = np.asarray(labels)
file_num = 0
for file in tqdm(list_of_samples):
    sample_byte_sequence = read_file(file)
    for i in range(min(max_size, len(sample_byte_sequence))):
        X[file_num, :, i] = embed_bytes(sample_byte_sequence[i])
    file_num += 1

100%|████████████████████████████████████████████████████████████████████████████████| 425/425 [00:56<00:00,  7.47it/s]


In [10]:
print(X.shape)

(425, 8, 15000)


In [11]:
from keras import optimizers

my_opt = optimizers.SGD(lr=0.01, decay=1e-5, nesterov=True)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
from keras import Input

inputs = Input(shape=(8, max_size))

In [13]:
from keras.layers import Conv1D

conv1 = Conv1D(kernel_size=(128), filters=32, strides=(128), padding="same")(inputs)
conv2 = Conv1D(kernel_size=(128), filters=32, strides=(128), padding="same")(inputs)

In [14]:
from keras.layers import Activation

a = Activation("sigmoid", name="sigmoid")(conv2)

In [15]:
from keras.layers import multiply

mul = multiply([conv1, a])

In [16]:
b = Activation("relu", name="relu")(mul)

In [17]:
from keras.layers import GlobalMaxPool1D

p = GlobalMaxPool1D()(b)

In [18]:
from keras.layers import Dense

d = Dense(16)(p)
predictions = Dense(1, activation="sigmoid")(d)

In [19]:
from keras import Model

model = Model(inputs=inputs, outputs=predictions)

In [20]:
model.compile(optimizer=my_opt, loss="binary_crossentropy", metrics=["acc"])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [21]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 8, 15000)     0                                            
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 1, 32)        61440032    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1, 32)        61440032    input_1[0][0]                    
__________________________________________________________________________________________________
sigmoid (Activation)            (None, 1, 32)        0           conv1d_2[0][0]                   
____________________________________________________________________________________________

In [23]:
batch_size = 16
num_batches = int(num_samples / batch_size)

In [24]:
for batch_num in tqdm(range(num_batches)):
    batch = X[batch_num * batch_size : (batch_num + 1) * batch_size]
    model.train_on_batch(
        batch, Y[batch_num * batch_size : (batch_num + 1) * batch_size]
    )

  0%|                                                                                           | 0/26 [00:00<?, ?it/s]




100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [01:24<00:00,  3.27s/it]


In [25]:
Y_pred = model.predict(X)
print(Y_pred)

[[0.4511658 ]
 [0.27134025]
 [0.26911965]
 [0.25649643]
 [0.28519595]
 [0.4486405 ]
 [0.28179505]
 [0.2757051 ]
 [0.2843523 ]
 [0.44558662]
 [0.2865172 ]
 [0.23864758]
 [0.28393835]
 [0.26684868]
 [0.29177156]
 [0.22533855]
 [0.28047037]
 [0.28332072]
 [0.30784154]
 [0.27907914]
 [0.27613682]
 [0.2791333 ]
 [0.28917545]
 [0.27971035]
 [0.27471715]
 [0.22994727]
 [0.2636001 ]
 [0.4521477 ]
 [0.28292328]
 [0.26878202]
 [0.28235263]
 [0.25721285]
 [0.3029415 ]
 [0.300709  ]
 [0.27807942]
 [0.300312  ]
 [0.26479873]
 [0.26549792]
 [0.28137535]
 [0.28237006]
 [0.29211378]
 [0.27910313]
 [0.28099918]
 [0.18006435]
 [0.18146756]
 [0.1809124 ]
 [0.2440238 ]
 [0.2557292 ]
 [0.26992804]
 [0.25467953]
 [0.44621143]
 [0.2964239 ]
 [0.28523386]
 [0.2871138 ]
 [0.26943552]
 [0.2943311 ]
 [0.28806144]
 [0.29537147]
 [0.28608334]
 [0.2882347 ]
 [0.28758734]
 [0.1745924 ]
 [0.21580216]
 [0.2178691 ]
 [0.1847192 ]
 [0.29983872]
 [0.45729852]
 [0.2715067 ]
 [0.28741187]
 [0.23940021]
 [0.29651374]
 [0.28