In [3]:
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU
from numpy.random import randn, randint
from numpy import ones
from numpy import zeros
from numpy.random import uniform, randint
import pandas as pd
import numpy as np
from keras.models import load_model
from sklearn.model_selection import train_test_split

In [4]:
# generator : input_dim：688（features_dims）+ 100（noise_dims）  hidden_layers：256 - 512 -1024  output：688
# subsititude detector: input_dim：688   hidden_layers: 512 - 512 - 512 - 1

In [5]:
# define the number of layers and dims
features_dims = 688
noise_dims = 100
latent_dim = 100
generator_layers = [features_dims+noise_dims, 256, 512, 1024, features_dims]
discriminator_layers = [features_dims, 512, 512, 512, 1 ]

define the standalone discriminator model

In [6]:
# def build_disriminator(discriminator_layers):
#     model = Sequential()
#     model.add(Dense(512, input_shape=(discriminator_layers[0],)))
#     model.add(LeakyReLU(alpha=0.05))
#     model.add(Dense(512))
#     model.add(LeakyReLU(alpha=0.05))
#     model.add(Dense(512))
#     model.add(LeakyReLU(alpha=0.05))
#     # one unit for fake or real
#     model.add(Dense(1, activation = 'sigmoid'))
#     optimizer=Adam(lr=0.0002, beta_1=0.5)
#     # only two classes --- binary_crossentropy
#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     model.summary()
#     return model

def build_disriminator(discriminator_layers):
    model = Sequential()
    model.add(Dense(512, activation = 'relu', input_shape=(discriminator_layers[0],)))
    model.add(Dense(512, activation = 'relu'))
    model.add(Dense(512, activation = 'relu'))
    # one unit for fake or real
    model.add(Dense(1, activation = 'sigmoid'))
    optimizer=Adam(lr=0.0002, beta_1=0.5)
    # only two classes --- binary_crossentropy
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()
    return model

define the standalone generator model

In [7]:
# it is not trained directly --- no comiple here
# def build_generator(latent_dim):
#     model = Sequential()
#     # use the dims of features as the units number
#     model.add(Dense(256, input_dim=latent_dim))
#     model.add(LeakyReLU(alpha=0.05))
#     model.add(Dense(512))
#     model.add(LeakyReLU(alpha=0.05))
#     model.add(Dense(1024))
#     model.add(LeakyReLU(alpha=0.05))
#     model.add(Dense(generator_layers[-1], activation = "tanh"))
#     model.summary()
#     return model

def build_generator(latent_dim):
    model = Sequential()
    # use the dims of features as the units number
    model.add(Dense(256, activation = 'relu', input_dim=latent_dim))
    model.add(Dense(512, activation = 'relu' ))
    model.add(Dense(1024, activation = 'relu'))
    model.add(Dense(generator_layers[-1], activation = "tanh"))
    model.summary()
    return model

build blackbox detector for single classifier

In [8]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifierCV, SGDClassifier
from sklearn.neural_network import MLPClassifier

global seed
seed = 0
# the classifiers include: SVC, DT, RC，SGD， MLP
def build_blackbox_classifier_detector(blackbox, seed):
    if blackbox in ['SVM']:
        blackbox_detector = SVC(kernel = 'linear')
    elif blackbox in ['DT']:
        blackbox_detector = DecisionTreeClassifier(random_state=seed)
    elif blackbox in ['RC']:
        blackbox_detector = RidgeClassifierCV()
    elif blackbox in ['SGD']:
        blackbox_detector = SGDClassifier(random_state=seed)
    elif blackbox in ['MLP']:
        blackbox_detector = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, alpha=1e-4,
                                              solver='sgd', verbose=0, tol=1e-4, random_state=seed,
                                              learning_rate_init=.1)
    
    return blackbox_detector

build blackbox detector for ensemble classifiers

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# the ensemble classifiers include: Bag, RF, Adaboost, GB
def build_blackbox_ensemble_classifier_detector(blackbox, seed):
    if blackbox in ['Bag']:
        blackbox_detector = BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=seed)
    elif blackbox in ['RF']:
        blackbox_detector = RandomForestClassifier(max_depth=2, random_state=seed)
    elif blackbox in ['AdaBoost']:
        blackbox_detector = AdaBoost_model(n_estimators=100, random_state=seed)
    elif blackbox in ['GB']:
        blackbox_detector = GradientBoostingClassifier(n_estimators=100, random_state=seed)
    
    return blackbox_detector

define the combined GAN model

In [10]:
# define the combined generator and disciminator model, for updating the generator
def define_gan(generator, discriminator):
    # connect them
    model = Sequential()
    # add generator
    model.add(generator)
    # add the discriminator
    model.add(discriminator)
    # compile model
    optimizer = Adam(lr=0.0002, beta_1=0.5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    model.summary()
    return model

load input data

In [11]:
def load_data(data_path):
    # get the data from partial TF-IDF features
    tf_idf_part = pd.read_pickle(data_path)
    data = tf_idf_part
    # tf_idf_part.cov()
    x_ran, y_ran, x_ben, y_ben = data[data['label']==1].iloc[:,:-1], data[data['label']==1].iloc[:,-1], data[data['label']==0].iloc[:,:-1], data[data['label']==0].iloc[:,-1]

    return (x_ran, y_ran), (x_ben, y_ben)
# define the path for dataset
data_path = '../dataset/training_data/features_ran_part.pkl'

require one batch (or a half) of real data to update to the GAN model 

In [12]:
# select real samples
def generate_real_samples(dataset, n_samples):
    # choose random instance
    ix = np.random.randint(0, dataset.shape[0], n_samples)
    # select data
    X = dataset[ix]
    # generate class labels
    y = np.ones(n_samples)
    return X, y

create the input for generator model (Gaussian Distributed Random Variables)

In [13]:
# generate points in latent space as input for the generator
def generate_latent_points(latent_dim, n_samples):
    # generate points in the latent space
    x_input = uniform(0,1,(latent_dim * n_samples))
    # reshape into a batch of inputs for the network
    x_input = x_input.reshape(n_samples, latent_dim)
    return x_input


generate fake examples

In [14]:
# use the generator to generate n fake examples with class labels
def generate_fake_samples(generator, latent_dim, n_samples):
    # generate points in latent space
    x_input = generate_latent_points(latent_dim, n_samples)
    # predict outputs
    X = generator.predict(x_input)
    # create class labels
    y = np.zeros(n_samples)
    return X,y

train the stacked GAN model

In [15]:
# train the generator and discriminator
# the D is updated for a half batch of real samples, then a half batch of fake samples
# the generator is then updated via the composite gan model
def train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs=100, n_batch=32):
    bat_per_epo = int(dataset.shape[0]/ n_batch)
    half_batch = int(n_batch / 2)
    for i in range(n_epochs):
        for j in range(bat_per_epo):
            
            # get randomly selected 'real' samples
            X_real, y_real = generate_real_samples(dataset, half_batch)
            # update discriminator model weights
            d_loss1, d_acc1 = d_model.train_on_batch(X_real, y_real)
            # generate 'fake' examples
            X_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
            # update discriminator model weights
            d_loss2, d_acc2 = d_model.train_on_batch(X_fake, y_fake)
            
            # prepare points in latent space as input for the generator
            X_gan = generate_latent_points(latent_dim, n_batch)
            # create inverted labels for fake samples
            y_gan = ones((n_batch, ))
            # update the generator via the discriminator's error
            g_loss = gan_model.train_on_batch(X_gan, y_gan)
            # summarize loss on this batch
            d_loss = 0.5 * np.add(d_loss1, d_loss2)
            d_acc = 0.5 * np.add(d_acc1, d_acc2)
            print('>%d, %d/%d, D_loss=%.3f, acc=%.3f G_loss=%.3f' %
                (i+1, j+1, bat_per_epo, d_loss, d_acc, g_loss))
    # save the generator model
    g_model.save('generator.h5')

In [16]:
def load_data(data):

    x_ran, y_ran, x_ben, y_ben = data[data['label']==1].iloc[:,:-1], data[data['label']==1].iloc[:,-1], data[data['label']==0].iloc[:,:-1], data[data['label']==0].iloc[:,-1]

    return (x_ran, y_ran), (x_ben, y_ben)

In [17]:
# size of the latent space
latent_dim = 100
# create the discriminator
discriminator = build_disriminator(discriminator_layers)
# create the generator
generator = build_generator(latent_dim)
# create the gan
gan_model = define_gan(generator, discriminator)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               352768    
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 513       
Total params: 878,593
Trainable params: 878,593
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 256)               25856     
__________________________



train the GAN with generator and discriminator in it

In [18]:
# load ransomware data
# get the data from partial TF-IDF features
tf_idf_part = pd.read_pickle('../dataset/training_data/features_ran_part.pkl')
# bypass the label column

(x_ran, y_ran), (x_ben, y_ben) = load_data(tf_idf_part)
# train model
train(generator, discriminator, gan_model, x_ran.values, latent_dim)



>1, 1/5, D_loss=0.693, acc=0.500 G_loss=0.796
>1, 2/5, D_loss=0.687, acc=0.500 G_loss=0.712
>1, 3/5, D_loss=0.670, acc=0.500 G_loss=0.667
>1, 4/5, D_loss=0.663, acc=0.500 G_loss=0.641
>1, 5/5, D_loss=0.656, acc=0.500 G_loss=0.611
>2, 1/5, D_loss=0.650, acc=0.500 G_loss=0.589
>2, 2/5, D_loss=0.665, acc=0.500 G_loss=0.568
>2, 3/5, D_loss=0.684, acc=0.500 G_loss=0.543
>2, 4/5, D_loss=0.712, acc=0.500 G_loss=0.520
>2, 5/5, D_loss=0.728, acc=0.500 G_loss=0.498
>3, 1/5, D_loss=0.760, acc=0.500 G_loss=0.492
>3, 2/5, D_loss=0.751, acc=0.500 G_loss=0.493
>3, 3/5, D_loss=0.750, acc=0.500 G_loss=0.502
>3, 4/5, D_loss=0.742, acc=0.500 G_loss=0.532
>3, 5/5, D_loss=0.703, acc=0.500 G_loss=0.566
>4, 1/5, D_loss=0.681, acc=0.500 G_loss=0.587
>4, 2/5, D_loss=0.655, acc=0.500 G_loss=0.595
>4, 3/5, D_loss=0.650, acc=0.500 G_loss=0.614
>4, 4/5, D_loss=0.618, acc=0.500 G_loss=0.634
>4, 5/5, D_loss=0.591, acc=0.500 G_loss=0.660
>5, 1/5, D_loss=0.560, acc=0.500 G_loss=0.688
>5, 2/5, D_loss=0.553, acc=0.500 G

>36, 3/5, D_loss=0.810, acc=0.500 G_loss=0.401
>36, 4/5, D_loss=0.810, acc=0.500 G_loss=0.399
>36, 5/5, D_loss=0.824, acc=0.500 G_loss=0.398
>37, 1/5, D_loss=0.823, acc=0.500 G_loss=0.397
>37, 2/5, D_loss=0.823, acc=0.500 G_loss=0.400
>37, 3/5, D_loss=0.832, acc=0.500 G_loss=0.407
>37, 4/5, D_loss=0.821, acc=0.500 G_loss=0.404
>37, 5/5, D_loss=0.828, acc=0.500 G_loss=0.404
>38, 1/5, D_loss=0.825, acc=0.500 G_loss=0.402
>38, 2/5, D_loss=0.821, acc=0.500 G_loss=0.406
>38, 3/5, D_loss=0.837, acc=0.500 G_loss=0.404
>38, 4/5, D_loss=0.820, acc=0.500 G_loss=0.409
>38, 5/5, D_loss=0.819, acc=0.500 G_loss=0.409
>39, 1/5, D_loss=0.817, acc=0.500 G_loss=0.407
>39, 2/5, D_loss=0.814, acc=0.500 G_loss=0.408
>39, 3/5, D_loss=0.814, acc=0.500 G_loss=0.410
>39, 4/5, D_loss=0.806, acc=0.500 G_loss=0.411
>39, 5/5, D_loss=0.811, acc=0.500 G_loss=0.412
>40, 1/5, D_loss=0.807, acc=0.500 G_loss=0.409
>40, 2/5, D_loss=0.830, acc=0.500 G_loss=0.408
>40, 3/5, D_loss=0.827, acc=0.500 G_loss=0.405
>40, 4/5, D_l

>71, 5/5, D_loss=0.869, acc=0.500 G_loss=0.417
>72, 1/5, D_loss=0.869, acc=0.500 G_loss=0.422
>72, 2/5, D_loss=0.858, acc=0.500 G_loss=0.415
>72, 3/5, D_loss=0.865, acc=0.500 G_loss=0.400
>72, 4/5, D_loss=0.861, acc=0.500 G_loss=0.401
>72, 5/5, D_loss=0.868, acc=0.500 G_loss=0.399
>73, 1/5, D_loss=0.860, acc=0.500 G_loss=0.407
>73, 2/5, D_loss=0.842, acc=0.500 G_loss=0.406
>73, 3/5, D_loss=0.841, acc=0.500 G_loss=0.408
>73, 4/5, D_loss=0.837, acc=0.500 G_loss=0.398
>73, 5/5, D_loss=0.842, acc=0.500 G_loss=0.400
>74, 1/5, D_loss=0.844, acc=0.500 G_loss=0.394
>74, 2/5, D_loss=0.844, acc=0.500 G_loss=0.392
>74, 3/5, D_loss=0.853, acc=0.500 G_loss=0.394
>74, 4/5, D_loss=0.865, acc=0.500 G_loss=0.385
>74, 5/5, D_loss=0.879, acc=0.500 G_loss=0.382
>75, 1/5, D_loss=0.878, acc=0.500 G_loss=0.387
>75, 2/5, D_loss=0.886, acc=0.500 G_loss=0.394
>75, 3/5, D_loss=0.879, acc=0.500 G_loss=0.394
>75, 4/5, D_loss=0.883, acc=0.500 G_loss=0.407
>75, 5/5, D_loss=0.875, acc=0.500 G_loss=0.408
>76, 1/5, D_l

use the saved G model to spoof the black_detector

In [19]:
def train_classifiers(generator, black_detector, data_path, latent_dim, n_epochs=100, n_batch=32):
    # train the black_detector with total_dataset(real mal and ben examples)
    (x_ran, y_ran), (x_ben, y_ben) = load_data(data_path)
    black_detector.fit(np.concatenate([x_ran, x_ben]), np.concatenate([y_ran, y_ben]))
    
    # split into training and testing datasets
    
#     bat_per_epo = int(dataset.shape[0]/ n_batch)
#     half_batch = int(n_batch / 2)

    for i in range(n_epochs):
        for j in range(bat_per_epo):
            
            # get randomly selected 'real' samples
            X_real, y_real = generate_real_samples(dataset, half_batch)
            # update discriminator model weights
            d_loss1, d_acc1 = d_model.train_on_batch(X_real, y_real)
            
            # generate 'fake' examples
            X_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
            # update discriminator model weights
            d_loss2, d_acc2 = d_model.train_on_batch(X_fake, y_fake)
            
            # prepare points in latent space as input for the generator
            X_gan = generate_latent_points(latent_dim, n_batch)
            # create inverted labels for fake samples
            y_gan = ones((n_batch, ))
            # update the generator via the discriminator's error
            g_loss = gan_model.train_on_batch(X_gan, y_gan)
            # summarize loss on this batch
            d_loss = 0.5 * np.add(d_loss1, d_loss2)
            d_acc = 0.5 * np.add(d_acc1, d_acc2)
            print('>%d, %d/%d, D_loss=%.3f, acc=%.3f G_loss=%.3f' %
                (i+1, j+1, bat_per_epo, d_loss, d_acc, g_loss))


train the GAN with generator and detector in it

In [None]:
generator = load_model('./generator.h5')

for blackbox in []:
    blackbox_detector = build_blackbox_classifier_detector(blackbox, seed)


In [None]:
for blackbox in []:
    blackbox_ensemble_detector = build_blackbox_ensemble_classifier_detector(blackbox, seed)

In [None]:
'''
1. Parameters: 
     100 epochs, 32 batch, LeakyRelu：
   Layers：
     D:512-512-512-1  G:256-512-1024-688
   Result:
     >100, 5/5, D_loss=0.740, acc=0.500 G_loss=0.608

2. Parameters: 
    100 epochs, 32 batch, relu:
   Layers：
    D:512-512-512-1  G:256-512-1024-688 
   Result:
    >100, 5/5, D_loss=0.793, acc=0.500 G_loss=0.422
'''
