# High Level Feature Extraction
CNN/LSTM Model  
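Train on the original data: 3,000 malware and 6,000 benign samples (the loading cell below yields 9,000 rows); the malware class is oversampled later with GAN-generated samples.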

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, MaxPooling1D,LeakyReLU, MaxPool1D, Dropout, LSTM, Embedding, Dense
from keras.optimizers import Adam
from keras import optimizers

In [2]:
# Source CSV: ';'-separated, no header row; the last column holds the class label.
url12000='https://www.dropbox.com/s/zuk6f9ax1hupb5u/finalshuf12000.csv?dl=1'
dataset = pd.read_csv(url12000, sep=';',on_bad_lines='skip', header = None)
# Sort by label, descending, so the label-1 rows come first and the label-0 rows last.
# NOTE(review): assumes the first/last 6000 sorted rows are purely malware/benign - confirm.
sort = dataset.sort_values(dataset.columns[-1], ascending = False)
malwares = sort.head(6000).head(3000)
benigns = (sort.tail(6000))#.head(3000)
# benigns_extra = (sort.tail(6000)).tail(3000)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
dataset = pd.concat([malwares, benigns], ignore_index=True)
# Shuffle the rows so both classes are mixed before training.
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset = np.array(dataset)
# Every column but the last is a feature; the last column is the label.
X_train,Y_train = np.delete(dataset,-1,axis=1),dataset[:,-1]
print('X_train.shape is:',X_train.shape)
print('Y_train.shape is:',Y_train.shape)
Y_train = Y_train.astype(int)
# One-hot encode the labels; the dtype is pinned because newer pandas defaults to bool.
Y_train = pd.get_dummies(Y_train, dtype=np.uint8).to_numpy()
Y_train[0]

X_train.shape is: (9000, 308)
Y_train.shape is: (9000,)


array([0, 1], dtype=uint8)

In [3]:
def create_CNN_model(x_train,y_train):
    """Build, compile and train the 1-D CNN used as the high-level feature extractor.

    Args:
        x_train: 2-D array (samples, features); values are cast to int and fed
            to an Embedding layer.
        y_train: one-hot encoded labels with two columns.

    Returns:
        (model, history): the trained Sequential model and the History object
        returned by ``fit``.
    """
    # Use the function's own argument (not the notebook-global X_train) so the
    # model is always sized from the data that was actually passed in.
    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1])
    x_train = x_train.astype(int)
    model = Sequential(name="CNN_model")
    # NOTE(review): input_dim=256 means the Embedding only accepts indices in
    # [0, 255]; presumably every feature is a byte value - confirm, since
    # out-of-range indices fail (or are silently zeroed) at fit time.
    model.add(Embedding(input_dim=256, output_dim=64, input_length=x_train.shape[1]))
    model.add(Conv1D(filters=128, kernel_size=5, strides = 1, padding='valid', activation='relu'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Dropout(0.3))
    model.add(Conv1D(filters=64, kernel_size=3, strides = 1, padding='valid', activation='relu'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Dropout(0.3))
    model.add(LeakyReLU(alpha=(0.1)))
    model.add(Flatten())
    # Hidden dense layer: its activations become the extracted features once
    # the softmax head below is popped off.
    model.add(Dense(100, activation='relu'))
    # Two-way softmax classification head.
    model.add(Dense(2, activation='softmax'))
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=.001), metrics=['accuracy'])
    history = model.fit(x_train,y_train,batch_size=80,epochs=50,validation_split=0.2)
    return model, history
CNN_model, history = create_CNN_model(X_train,Y_train)

Model: "CNN_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 308, 64)           16384     
                                                                 
 conv1d (Conv1D)             (None, 304, 128)          41088     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 101, 128)         0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 101, 128)          0         
                                                                 
 conv1d_1 (Conv1D)           (None, 99, 64)            24640     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 33, 64)           0         
 1D)                                                     

InvalidArgumentError: ignored

In [None]:
# Remove the final 2-unit softmax layer so that predict() returns the activations
# of the last hidden Dense layer, which are used as feature vectors for the GAN.
# The guard keeps the cell idempotent: re-running it will not strip further layers.
# NOTE(review): the remaining last layer is Dense(100), so predict() yields 100
# values per sample, while the GAN below is sized for 32 - confirm which is intended.
if CNN_model.output_shape[-1] == 2:
    CNN_model.pop()
CNN_model.summary()

Use the pretrained model to extract 32 high-level features for the malware samples only, and pass them to the GAN. Also extract features for the benign files that we will use for oversampling alongside the generated malware.

In [None]:
# Malware-only CSV (';'-separated, no header row).
urlMalware = 'https://www.dropbox.com/s/701tpt672z5paao/headers328overfit.csv?dl=1'
# Load the file and drop its trailing 21 columns in one step.
dataset = pd.read_csv(urlMalware, header=None, sep=';', on_bad_lines='skip').iloc[:, :-21]
# Append a constant label column (1.0) as the new last column.
dataset[len(dataset.columns)] = 1.0
dataset = np.array(dataset)
# Split into features (all but the last column) and labels (last column).
X_train = np.delete(dataset, -1, axis=1)
Y_train = dataset[:, -1]
# Run the truncated CNN to obtain one feature vector per malware sample.
malware_features = CNN_model.predict(X_train, batch_size=80)
malware_features.shape

In [None]:
# benigns_extra = np.array(benigns_extra)
# X_train,Y_train = np.delete(benigns_extra,-1,axis=1),benigns_extra[:,-1]
# new_benign_features = CNN_model.predict(X_train,batch_size=80)
# new_benign_features = np.c_[new_benign_features, Y_train]
# np.savetxt("new_benigns.csv", new_benign_features, delimiter=";")

# Boundary Seeking GAN

In [None]:
from __future__ import print_function, division
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D, LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
import keras.backend as K

import matplotlib.pyplot as plt

import sys

import numpy as np

class BGAN():
    """Boundary-seeking GAN over fixed-length feature vectors.

    A dense generator maps ``latent_dim`` Gaussian noise to ``img_shape``
    values in [-1, 1] (tanh); a dense discriminator maps a feature vector to a
    probability (sigmoid). The generator is trained through ``self.combined``
    with the boundary-seeking loss.

    Args:
        img_shape: length of one feature vector (default 32, as before).
        latent_dim: length of the generator's noise input (default 10, as before).
    """
    def __init__(self, img_shape=32, latent_dim=10):
        self.img_shape = img_shape
        self.latent_dim = latent_dim
        # One optimizer per compiled model: recent Keras optimizers are bound to
        # the variables of the first model they are used with.
        d_optimizer = Adam(learning_rate=0.0002, beta_1=0.5)
        g_optimizer = Adam(learning_rate=0.0002, beta_1=0.5)
        self.discriminator = self.build_discriminator()
        # The discriminator ends in a sigmoid, so the loss receives probabilities,
        # not logits. Label smoothing is applied here (and only here).
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=False, label_smoothing=0.1)
        self.discriminator.compile(loss=loss,optimizer=d_optimizer,metrics=['accuracy'])
        self.generator = self.build_generator()
        # The generator takes noise as input and generates samples
        z = Input(shape=(self.latent_dim,))
        generated_sample = self.generator(z)
        # For the combined model we will only train the generator
        self.discriminator.trainable = False
        # The discriminator takes generated samples as input and determines validity
        valid = self.discriminator(generated_sample)
        # The combined model (stacked generator and discriminator)
        # trains the generator to fool the discriminator
        self.combined = Model(z, valid)
        self.combined.compile(loss=self.boundary_loss, optimizer=g_optimizer)

    def build_generator(self):
        """Return the generator Model: noise (latent_dim,) -> sample (img_shape,)."""
        model = Sequential(name="Generator")
        model.add(Dense(256, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dropout(0.2))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dropout(0.2))
        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dropout(0.2))
        # tanh bounds every generated value to [-1, 1].
        model.add(Dense(np.prod(self.img_shape), activation='tanh'))

        model.summary()
        noise = Input(shape=(self.latent_dim,))
        sample = model(noise)
        return Model(noise, sample)

    def build_discriminator(self):
        """Return the discriminator Model: sample (img_shape,) -> probability (1,)."""
        model = Sequential(name="Discriminator")
        # Sized from self.img_shape instead of a second hard-coded 32.
        model.add(Dense(512, input_dim=self.img_shape))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.2))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.2))
        model.add(Dense(128))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.2))
        model.add(Dense(64))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1, activation='sigmoid'))

        model.summary()
        # Input expects a shape tuple; a bare int is not accepted by every Keras version.
        img = Input(shape=(self.img_shape,))
        validity = model(img)
        return Model(img, validity)

    def boundary_loss(self, y_true, y_pred):
        """Boundary-seeking generator loss: 0.5 * mean((log D - log(1 - D))^2).

        ``y_true`` is ignored. ``y_pred`` is clipped away from 0 and 1 so the
        logarithms stay finite when the discriminator saturates.
        """
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1.0 - eps)
        return 0.5 * K.mean((K.log(y_pred) - K.log(1 - y_pred))**2)

    def train(self, X_train, epochs, batch_size=80):
        """Alternate one discriminator step and one generator step per epoch.

        Args:
            X_train: 2-D array (samples, img_shape) of real feature vectors.
            epochs: number of training iterations (one random batch each).
            batch_size: rows per real / generated batch.

        Raises:
            ValueError: if ``X_train`` is not 2-D with ``img_shape`` columns.
        """
        X_train = np.asarray(X_train)
        if X_train.ndim != 2 or X_train.shape[1] != np.prod(self.img_shape):
            raise ValueError(
                "X_train must have shape (n_samples, %d), got %s"
                % (np.prod(self.img_shape), X_train.shape,))
        # Adversarial ground truths. Hard 0/1 labels are used so the 'accuracy'
        # metric is meaningful (a 0.9 target never equals a thresholded
        # prediction); smoothing is done once, by the loss's label_smoothing.
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))
        for epoch in range(epochs):
            # ---------------------
            #  Train Discriminator
            # ---------------------
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            imgs = X_train[idx]

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            # verbose=0: avoid one progress bar per iteration.
            gen_imgs = self.generator.predict(noise, verbose=0)
            d_loss_real = self.discriminator.train_on_batch(imgs, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_imgs, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------
            # The target is ignored by boundary_loss; it only has to match the output shape.
            g_loss = self.combined.train_on_batch(noise, valid)
            print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

    def predict(self, samples, batch_size=80):
        """Generate ``samples`` batches of ``batch_size`` synthetic feature vectors.

        At least one batch is always generated. Returns an array of shape
        (max(samples, 1) * batch_size, img_shape).
        """
        batches = []
        for batch_no in range(1, max(samples, 1) + 1):
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            batches.append(self.generator.predict(noise, verbose=0))
            if batch_no > 1:
                print("%d/%d" % (batch_no, samples))
        # Single concatenation instead of re-copying the result on every batch.
        return np.concatenate(batches, axis=0)

In [None]:
# Fit the boundary-seeking GAN on the malware feature vectors, then draw
# 75 batches of 80 synthetic samples (75 * 80 = 6000 rows).
bgan = BGAN()
bgan.train(X_train=malware_features, epochs=120, batch_size=80)
fake_malware_samples = bgan.predict(samples=75, batch_size=80)

In [None]:
# Shape check: predict(75, batch_size=80) stacks 75 batches of 80 rows, so 6000 rows are expected.
fake_malware_samples.shape

In [None]:
# Eyeball one generated feature vector against one real malware feature vector.
print(fake_malware_samples[0])
print(malware_features[0])

In [None]:
# Append the malware label (1) as the last column of the generated samples.
# Guarded on the generator's output width so that re-running this cell does not
# keep adding label columns.
if fake_malware_samples.shape[1] == bgan.img_shape:
    fake_malware_samples = np.c_[fake_malware_samples, np.ones(fake_malware_samples.shape[0])]
fake_malware_samples.shape

In [None]:
# Persist the labelled synthetic samples as ';'-separated text; the file is read back in the oversampling section.
np.savetxt("fake_malware_samples.csv", fake_malware_samples, delimiter=";")

# Data Processing
Get new unique 3000 benign samples to oversample the benign files as well.  
Make sure they aren't present in the original 6000 dataset.

In [None]:
# url12000='https://www.dropbox.com/s/wjxpmvduekwqr5i/dataset12000.csv?dl=1'
# dataset = pd.read_csv(url12000, sep=';',on_bad_lines='skip', header = None)
# dataset2 = pd.read_csv(url6000, sep=';',on_bad_lines='skip', header = None)
# dataset = dataset.drop(dataset[dataset.iloc[:,-1] == 0].index)
# dataset2.info()

In [None]:
# combined = dataset.merge(dataset2, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
# combined = (combined.drop(['_merge'], axis=1)).head(3000)
# combined.info()

In [None]:
# new_benign = np.array(combined)
# X_train,Y_train = np.delete(new_benign,-1,axis=1),new_benign[:,-1]
# new_benign_features = CNN_model.predict(X_train,batch_size=80)
# new_benign_features.shape

In [None]:
# new_benign_features = np.c_[new_benign_features, np.zeros(new_benign_features.shape[0])]

Convert original 6000 dataset to extracted features. 
Will classify them alone, then oversample with the new_samples and new_benign for comparison.

In [None]:
# dataset = pd.read_csv(url12000, sep=';',on_bad_lines='skip', header = None)
# dataset = np.array(dataset)
# Rebuild the labelled training frame from the malware / benign subsets.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
dataset = pd.concat([malwares, benigns], ignore_index=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset = np.array(dataset)
# Features are every column but the last; the last column is the label.
X_train,Y_train = np.delete(dataset,-1,axis=1),dataset[:,-1]
# Run the truncated CNN to turn each raw sample into its feature vector.
orig_data_features = CNN_model.predict(X_train,batch_size=80)
orig_data_features.shape

In [None]:
# Add the label back as the last column. Guarded on the feature extractor's
# output width so that re-running this cell does not append the label twice.
if orig_data_features.shape[1] == CNN_model.output_shape[-1]:
    orig_data_features = np.c_[orig_data_features, Y_train]

In [None]:
# Persist the labelled feature vectors as ';'-separated text; the classification cells read this file back.
np.savetxt("orig_dataset.csv", orig_data_features, delimiter=";")

# Classification model

### Training
With original dataset

In [None]:
from keras.layers import *
from keras.models import *
from keras import backend as K
# Add attention layer to the deep learning network
class attention(Layer):
    """Additive attention pooling over the time axis.

    For an input of shape (batch, timesteps, features) the layer scores every
    timestep, softmax-normalises the scores over time and returns the weighted
    sum of the timesteps, i.e. a (batch, features) context vector.

    Args:
        return_sequences: stored and serialised for compatibility, but not used
            by ``call``; the layer always returns the summed context vector.
    """
    def __init__(self,return_sequences=True,**kwargs):
        # Initialise the base Layer first, then attach our own attributes.
        super(attention,self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self,input_shape):
        # W projects each timestep's feature vector to a single score.
        self.W=self.add_weight(name='attention_weight', shape=(input_shape[-1],1),
                               initializer='random_normal', trainable=True)
        # One bias per timestep, so the layer is tied to a fixed sequence length.
        self.b=self.add_weight(name='attention_bias', shape=(input_shape[1],1),
                               initializer='zeros', trainable=True)

    def call(self,x):
        # Alignment scores. Pass them through tanh function
        e = K.tanh(K.dot(x,self.W)+self.b)
        # Remove dimension of size 1
        e = K.squeeze(e, axis=-1)
        # Compute the weights (softmax over the last remaining axis)
        alpha = K.softmax(e)
        # Restore the trailing axis so the weights broadcast against x
        alpha = K.expand_dims(alpha, axis=-1)
        # Compute the context vector
        context = x * alpha
        context = K.sum(context, axis=1)
        return context

    def get_config(self):
        # Include the constructor argument so the layer survives save / clone / reload.
        config = super(attention,self).get_config()
        config.update({'return_sequences': self.return_sequences})
        return config

In [None]:
# Reload the saved feature vectors (label in the last column) as a NumPy array.
dataset = np.array(pd.read_csv('orig_dataset.csv', header=None, sep=';', on_bad_lines='skip'))
features = np.delete(dataset, -1, axis=1)
labels = dataset[:, -1]
# 40% of the rows are used for training, 60% are held out for testing.
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, shuffle=True, test_size=0.60)
# One-hot encode both label vectors as integer arrays.
Y_train = pd.get_dummies(Y_train).to_numpy().astype(int)
Y_test = pd.get_dummies(Y_test).to_numpy().astype(int)
X_train.shape

In [None]:
# Scratch expression: displays the tuple (2, number of feature columns in X_train); nothing is assigned.
(2, X_train.shape[1])

In [None]:
def create_classification_model(x_train,y_train, batch_size = 80, epochs = 50, learning_rate = .001):
    """Build, compile and train the CNN + BiLSTM + attention classifier.

    Args:
        x_train: 2-D array (samples, features); a trailing channel axis is added here.
        y_train: one-hot encoded labels with two columns.
        batch_size: mini-batch size passed to ``fit``.
        epochs: number of training epochs (default 50, as before).
        learning_rate: Adam learning rate (default 0.001, as before).

    Returns:
        (model, history): the trained Sequential model and the History object
        returned by ``fit``.
    """
    # Conv1D expects (samples, steps, channels): treat every feature as one step with one channel.
    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
    model = Sequential()
    model.add(Conv1D(filters=128, kernel_size=5, strides = 1, padding='valid', activation='relu', input_shape=(x_train.shape[1], 1)))
    model.add(MaxPool1D(pool_size=3))
    model.add(Dropout(0.3))
    model.add(Conv1D(filters=64, kernel_size=3, strides = 1, padding='valid', activation='relu'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Dropout(0.3))
    # return_sequences=True keeps the time axis, which the attention layer pools over.
    model.add(Bidirectional(LSTM(256, activation='tanh', return_sequences=True,dropout=0.2)))
    model.add(attention(return_sequences=True))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))  # hidden dense layer
    model.add(Dense(2, activation='softmax'))  # Output layer
    model.summary()
    # `learning_rate` replaces the deprecated `lr` keyword (rejected by newer Keras)
    # and matches the spelling used for the CNN feature extractor.
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
    history = model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_split=0.2)
    return model, history
classification_model, train_result = create_classification_model(X_train,Y_train)

With oversampling. 

In [None]:
# Reload the real feature vectors and the GAN-generated malware feature vectors.
dataset = pd.read_csv('orig_dataset.csv', header=None, sep=';', on_bad_lines='skip')
dataset2 = pd.read_csv('fake_malware_samples.csv', header=None, sep=';', on_bad_lines='skip')
# dataset3 = pd.read_csv('new_benigns.csv', sep=';',on_bad_lines='skip', header = None)
# Report the row/column counts of both frames, real data first.
print(dataset.shape)
print(dataset2.shape)
# print(dataset3.shape)

In [None]:
# Alternative oversampling amounts that were tried:
# frames = [dataset, dataset2.sample(1500)]
# frames = [dataset, dataset2.sample(3000)]
# Biggest oversample of malware works the best
frames = [dataset, dataset2.sample(3000)]  # , dataset3]
# Stack real and synthetic rows, shuffle them, and renumber the index.
combined_data = (
    pd.concat(frames)
    .sample(frac=1)
    .reset_index(drop=True)
)
combined_data.shape

In [None]:
# Convert the oversampled frame to an array and separate features from the label column.
combined_data = np.array(combined_data)
features = np.delete(combined_data, -1, axis=1)
labels = combined_data[:, -1]
# Same 40/60 train/test proportions as for the original data.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(features, labels, shuffle=True, test_size=0.60)
# One-hot encode both label vectors as integer arrays.
Y_train2 = pd.get_dummies(Y_train2).to_numpy().astype(int)
Y_test2 = pd.get_dummies(Y_test2).to_numpy().astype(int)
X_train2.shape

In [None]:
# Train a second, independent classifier on the oversampled split (same function, default arguments).
classification_model2, train_result2 = create_classification_model(X_train2,Y_train2)

In [None]:
def plot_training_summary(history):
    """Draw the train/validation curves of a Keras History, one figure per metric.

    Two 8x8 figures are shown, in order: accuracy, then loss. In each, the
    training curve is blue and the validation curve is red.
    """
    panels = (
        ('accuracy', 'val_accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Loss'),
    )
    for train_key, val_key, heading in panels:
        plt.figure(figsize = (8,8))
        plt.plot(history.history[train_key], color = 'blue', label = 'train')
        plt.plot(history.history[val_key], color = 'red', label = 'val')
        plt.legend()
        plt.title(heading)
        plt.show()

# Training curves: original data first, then the oversampled run.
plot_training_summary(train_result)
plot_training_summary(train_result2)

In [None]:
# Last-epoch training and validation accuracy for both runs.
print("Original accuracy:", train_result.history['accuracy'][-1])
print("Original validation accuracy:", train_result.history['val_accuracy'][-1])
print("Oversampled accuracy:", train_result2.history['accuracy'][-1])
print("Oversampled validation accuracy:", train_result2.history['val_accuracy'][-1])

### Testing

In [None]:
# evaluating the model
from sklearn.metrics import confusion_matrix
import seaborn as sns
def model_eval(model, test_x, test_y, title):
    """Evaluate a trained classifier and plot its confusion matrix.

    Args:
        model: compiled Keras model whose ``evaluate`` returns (loss, accuracy).
        test_x: test inputs, already shaped as the model expects.
        test_y: one-hot encoded test labels (column 0 = benign, column 1 = malware).
        title: text appended to the plot title.

    Returns:
        (accuracy, conf_matrix): accuracy as a fraction, and the 2x2 confusion
        matrix with true labels on the rows and predicted labels on the columns.
    """
    loss, accuracy = model.evaluate(test_x, test_y, verbose=2)
    print('Accuracy of the model is: ', accuracy*100)
    # Collapse softmax outputs / one-hot labels to class indices.
    predicted_y = np.argmax(model.predict(test_x), axis=1)
    true_y = np.argmax(test_y, axis=1)

    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the
    # predictions, so the tick labels and downstream indexing stay valid.
    conf_matrix = confusion_matrix(true_y, predicted_y, labels=[0, 1])

    # Create the figure *before* drawing, so the size applies to this plot and
    # no empty figure is left behind.
    fig, ax = plt.subplots(figsize=(8, 6))
    # Counts are integers, so annotate them without a decimal part.
    sns.heatmap(conf_matrix, linewidths = 0.1, cmap = 'Blues', linecolor = 'gray', fmt = 'd', annot = True, ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix\n' + title)
    ax.xaxis.set_ticklabels(['Benign', 'Malware'])
    ax.yaxis.set_ticklabels(['Benign', 'Malware'])
    plt.show()
    return accuracy, conf_matrix

In [None]:
# Add the trailing channel axis expected by the Conv1D input layer.
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
X_test2 = X_test2.reshape(X_test2.shape[0], X_test2.shape[1], 1)
# Evaluate each model once (the original model was previously evaluated twice).
accuracy_1, conf_matrix1 = model_eval(classification_model,X_test,Y_test, 'Model-6')
accuracy_2, conf_matrix2 = model_eval(classification_model2,X_test2,Y_test2, 'Model-6 Oversampled')

import matplotlib.pyplot as plt
# Bar chart comparing the two test accuracies (in percent).
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
langs = ['Without Oversampling', 'With Oversampling']
val_acc = [accuracy_1*100, accuracy_2*100]
pps = ax.bar(langs,val_acc)
for p in pps:
   height = p.get_height()
   # Write the value just below the top edge of each bar.
   ax.annotate('{:.3f}%'.format(height), xy=(p.get_x() + p.get_width() / 2, height), xytext=(0, -20), textcoords="offset points", ha='center', va='bottom')
# The y-axis is zoomed to 98-100%; bars below 98% are not visible.
plt.ylim(98,100)
plt.show()

In [None]:
# Test accuracy (as a fraction) of the original and the oversampled model.
print("Original test accuracy:", accuracy_1)
print("Oversampled test accuracy:", accuracy_2)

In [None]:
# Unpack each 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
TN1, FP1, FN1, TP1 = conf_matrix1[:2, :2].ravel()
TN2, FP2, FN2, TP2 = conf_matrix2[:2, :2].ravel()
# Precision = TP / (TP + FP), in percent.
precision1 = (TP1 / (TP1 + FP1)) * 100
precision2 = (TP2 / (TP2 + FP2)) * 100
# Recall = TP / (TP + FN), in percent.
recall1 = (TP1 / (TP1 + FN1)) * 100
recall2 = (TP2 / (TP2 + FN2)) * 100

In [None]:
# Hard-coded accuracy / precision / recall values (in percent) for Model-1..3,
# plotted next to the Model-6 results below.
# NOTE(review): presumably copied from earlier runs of the other models - confirm the source.
model_1_acc, model_2_acc, model_3_acc = 99.03333187103271, 99.13333058357239, 99.23333525657654
model_1_pre, model_2_pre, model_3_pre = 98.40213049267643, 98.59906604402934, 99.32432432432432
model_1_rec, model_2_rec, model_3_rec = 99.66284558327713, 99.66284558327713, 99.12339851652057

In [None]:
import matplotlib.pyplot as plt

def _plot_metric_bars(labels, values, title):
    """Draw one annotated bar chart comparing a metric (in percent) across models.

    Bars 0, 1, 3 and 4 are recoloured (red, green, cyan, magenta); bar 2 keeps
    the default colour. The y-axis is zoomed to 98-100%.
    """
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    bar = ax.bar(labels, values)
    for p in bar:
        height = p.get_height()
        # Write the value just below the top edge of each bar.
        ax.annotate('{:.3f}%'.format(height),
                    xy=(p.get_x() + p.get_width() / 2, height),
                    xytext=(0, -20),
                    textcoords="offset points",
                    ha='center', va='bottom')
    for index, colour in ((0, 'r'), (1, 'g'), (3, 'c'), (4, 'm')):
        if index < len(bar):
            bar[index].set_color(colour)
    plt.ylim(98,100)
    plt.title(title)
    plt.show()

langs = ['Model-1','Model-2', 'Model-3', 'Model-6', 'Model-6\nOversampled']
test_acc = [model_1_acc, model_2_acc, model_3_acc, accuracy_1*100, accuracy_2*100]
test_pre = [model_1_pre, model_2_pre, model_3_pre, precision1, precision2]
test_rec = [model_1_rec, model_2_rec, model_3_rec, recall1, recall2]
# F1 is the harmonic mean of precision and recall, computed per model.
test_F1 = [2*(pre*rec)/(pre+rec) for pre, rec in zip(test_pre, test_rec)]

# One chart per metric, in the original order.
_plot_metric_bars(langs, test_acc, 'Test Accuracy')
_plot_metric_bars(langs, test_pre, 'Precision')
_plot_metric_bars(langs, test_rec, 'Recall')
_plot_metric_bars(langs, test_F1, 'F1 Score')