<a href="https://colab.research.google.com/github/OctoberFall/SoK-Security/blob/main/PDF_malware_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a multi-layer perceptron classifier to classify PDF samples as benign or malicious

The black-box PDF classifier is used to evaluate explanations on a test PDF sample.

Reference: Evaluating Explanation Methods for Deep Learning in Security, A. Warnecke, D. Arp, C. Wressnegger, and K. Rieck, IEEE European Symposium on Security and Privacy (Euro S&P), 2020. [GitHub](https://github.com/alewarne/explain-mlsec)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

In [None]:

# Location of the feature table (CSV whose first row is a header).
path_to_csv = "data/contagio-all.csv"
label_column = 0  # column index of the class label
non_relevant_columns = [1]  # column 1 is the filename, which is not a feature

# First pass: read every cell as text (header row included) so the filename
# column and the total column count can be inspected.
arr = np.genfromtxt(path_to_csv, delimiter=',', dtype=str)
no_features = arr.shape[1]  # counts every CSV column
filenames = arr[1:, 1]  # skip the header row, keep the filename column
no_features

137

In [None]:
# Second pass: parse only the numeric columns, skipping the header row and
# every column listed in `non_relevant_columns`.
columns_to_use = [i for i in range(no_features) if i not in non_relevant_columns]
# The `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `float` is the drop-in replacement (same float64 result).
arr = np.genfromtxt(path_to_csv, dtype=float, delimiter=',', skip_header=1, usecols=columns_to_use)

# One-hot encode the labels: 0 -> [1, 0], anything else -> [0, 1].
labels = arr[:, label_column]
labels = np.array([[1, 0] if label == 0 else [0, 1] for label in labels])

# Everything except the label column is a feature. Use `label_column` rather
# than a hard-coded 0 so the two lines above and below cannot drift apart.
data = np.delete(arr, label_column, axis=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  arr = np.genfromtxt(path_to_csv, dtype=np.float, delimiter=',', skip_header=1, usecols=columns_to_use)


In [None]:
# Experiment settings.
random_seed = 123456  # random_state for the train/test split
binary_encoding = True  # True: binarize features; False: max-normalize them
loss = 'binary_crossentropy'
vec_output = True

In [None]:
# Either collapse every non-zero feature value to 1, or rescale the features
# with scikit-learn's 'max' norm along axis 0.
if not binary_encoding:
    data = normalize(data, norm='max', axis=0)
else:
    data[data != 0] = 1

In [None]:
# Split features, labels and filenames in ONE call: all three arrays are then
# indexed with the same shuffled train/test indices, and a length mismatch
# raises instead of silently pairing filenames with the wrong samples.
(x_train, x_test,
 y_train, y_test,
 filenames_train, filenames_test) = train_test_split(
    data, labels, filenames, test_size=0.25, random_state=random_seed)


## Add an MLP network

In [None]:
import sys
import os
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from custom_metrics import custom_true_positive_metric, custom_false_positive_metric
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Input width of the network: number of columns in the training matrix.
no_features = x_train.shape[1]

# Architecture options.
final_nonlinearity = 'softmax'
vec_output = True

# Training options.
loss = 'binary_crossentropy'
optimizer = 'adam'
batch_size = 32
epochs = 100

In [None]:
# Seed TensorFlow's global RNG so training is more repeatable across
# "Restart & Run All" (the split above is already seeded with the same value).
tf.random.set_seed(random_seed)

# MLP: two 200-unit ReLU hidden layers, each followed by 50% dropout, and a
# 2-unit output layer matching the two-column one-hot labels.
# Layers are taken from `tensorflow.keras.layers` so they belong to the same
# package as the `tensorflow.keras` Sequential container, instead of mixing in
# layer classes from the standalone `keras` import.
model = keras.Sequential([
    layers.Dense(units=200, activation='relu', input_shape=(no_features,)),
    layers.Dropout(rate=0.5),
    layers.Dense(units=200, activation='relu'),
    layers.Dropout(rate=0.5),
    # Use the configured output activation rather than a hard-coded copy.
    layers.Dense(units=2, activation=final_nonlinearity),
])
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Keyword arguments keep batch_size/epochs from being swapped by position.
# NOTE(review): the held-out test split doubles as validation data here.
# Binding the History object keeps its repr out of the cell output.
history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    verbose=2,
)

Epoch 1/100
235/235 - 1s - loss: 0.1434 - accuracy: 0.9552 - val_loss: 0.0332 - val_accuracy: 0.9876 - 810ms/epoch - 3ms/step
Epoch 2/100
235/235 - 0s - loss: 0.0481 - accuracy: 0.9867 - val_loss: 0.0339 - val_accuracy: 0.9868 - 335ms/epoch - 1ms/step
Epoch 3/100
235/235 - 0s - loss: 0.0404 - accuracy: 0.9888 - val_loss: 0.0239 - val_accuracy: 0.9928 - 311ms/epoch - 1ms/step
Epoch 4/100
235/235 - 0s - loss: 0.0326 - accuracy: 0.9907 - val_loss: 0.0396 - val_accuracy: 0.9868 - 345ms/epoch - 1ms/step
Epoch 5/100
235/235 - 0s - loss: 0.0331 - accuracy: 0.9888 - val_loss: 0.0259 - val_accuracy: 0.9884 - 361ms/epoch - 2ms/step
Epoch 6/100
235/235 - 0s - loss: 0.0315 - accuracy: 0.9900 - val_loss: 0.0192 - val_accuracy: 0.9944 - 327ms/epoch - 1ms/step
Epoch 7/100
235/235 - 0s - loss: 0.0291 - accuracy: 0.9911 - val_loss: 0.0301 - val_accuracy: 0.9920 - 367ms/epoch - 2ms/step
Epoch 8/100
235/235 - 0s - loss: 0.0256 - accuracy: 0.9917 - val_loss: 0.0178 - val_accuracy: 0.9944 - 362ms/epoch - 2

Epoch 66/100
235/235 - 0s - loss: 0.0089 - accuracy: 0.9971 - val_loss: 0.0126 - val_accuracy: 0.9972 - 323ms/epoch - 1ms/step
Epoch 67/100
235/235 - 0s - loss: 0.0052 - accuracy: 0.9985 - val_loss: 0.0112 - val_accuracy: 0.9980 - 321ms/epoch - 1ms/step
Epoch 68/100
235/235 - 0s - loss: 0.0082 - accuracy: 0.9972 - val_loss: 0.0153 - val_accuracy: 0.9960 - 358ms/epoch - 2ms/step
Epoch 69/100
235/235 - 0s - loss: 0.0108 - accuracy: 0.9968 - val_loss: 0.0149 - val_accuracy: 0.9952 - 370ms/epoch - 2ms/step
Epoch 70/100
235/235 - 0s - loss: 0.0083 - accuracy: 0.9969 - val_loss: 0.0128 - val_accuracy: 0.9984 - 322ms/epoch - 1ms/step
Epoch 71/100
235/235 - 0s - loss: 0.0091 - accuracy: 0.9967 - val_loss: 0.0108 - val_accuracy: 0.9980 - 332ms/epoch - 1ms/step
Epoch 72/100
235/235 - 0s - loss: 0.0066 - accuracy: 0.9975 - val_loss: 0.0127 - val_accuracy: 0.9960 - 361ms/epoch - 2ms/step
Epoch 73/100
235/235 - 0s - loss: 0.0089 - accuracy: 0.9968 - val_loss: 0.0112 - val_accuracy: 0.9980 - 399ms/e

<keras.callbacks.History at 0x7fb0536b2370>

In [None]:
# Persist the trained model to disk as an HDF5 (.h5) file.
model.save("models/keras_model.h5")

In [None]:
def get_statistics(model, x_test, y_test):
    """Print accuracy, precision, recall, FPR and F1 score of `model` on a test set.

    Args:
        model: Object with a `predict(x)` method returning per-class scores,
            one row per sample.
        x_test: Test inputs, handed to `model.predict` unchanged.
        y_test: One-hot ground-truth labels, one row per sample. Class index 1
            is treated as the positive class.

    Returns:
        None. The metrics are printed to stdout. A metric whose denominator is
        zero (e.g. recall when there are no positive samples) is reported as
        NaN.
    """
    y_pred = np.argmax(model.predict(x_test), axis=1)
    y_true = np.argmax(y_test, axis=1)
    assert len(y_pred) == len(y_true)

    def ratio(numerator, denominator):
        # Report NaN for an undefined metric instead of dividing by zero.
        return numerator / denominator if denominator else float('nan')

    # Builtin float: the `np.float` alias was removed in NumPy 1.24.
    acc = ratio(np.sum(y_pred == y_true), float(len(y_pred)))
    # Pin the label set so the matrix is always 2x2, even when only one class
    # occurs in the evaluated samples.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Rows index the true class, columns the predicted class.
    TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    TPR = ratio(TP, TP + FN)
    FPR = ratio(FP, FP + TN)
    precision = ratio(TP, TP + FP)
    F1 = ratio(2 * TP, 2 * TP + FP + FN)
    print('The model achieved: Accuracy:{}, Precision:{}, Recall:{}, FPR:{}, F1 score:{} on the test set.'.format(
        acc, precision, TPR, FPR, F1))

In [None]:
# Report the classification metrics of the trained model on the test split.
get_statistics(model=model, x_test=x_test, y_test=y_test)

The model achieved: Accuracy:0.998, Precision:0.9991680532445923, Recall:0.9966804979253112, FPR:0.0007722007722007722, F1 score:0.9979227253842958 on the test set.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  acc = np.sum(y_pred==y_test)/np.float(len(y_pred))
