Declaration for data features and methods

In [1]:
# dataset: tf_idf_part (333, 688)
# algorithms need to test: SVC，DecisionTreeClassifier，RidgeClassifierCV, SGDClassifier, MLPClassifier
# generator : input_shape：688（features dims）+20（noise dims）   hidden layer nodes：256  output layer nodes：128
# subsititude detector: 128 - 256 - 1

import the necessary modules

In [1]:
# import modules from tensorflow while not keras directly to advoid some problems in training
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense 
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Maximum, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifierCV, SGDClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pandas as pd

Load the dataset

In [2]:
# get the data from partial TF-IDF features
tf_idf_part = pd.read_pickle('../dataset/training_data/features_ran_part.pkl')
# bypass the label column
tf_idf_part

Unnamed: 0,35700,35701,35702,35707,35708,35709,35710,35711,35712,35713,...,43803,43815,43976,43982,43985,44010,44220,44380,44405,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,1,0,1,1,1,1,1,1,1,1,...,0,0,1,1,1,1,1,1,1,0
329,1,0,1,1,1,1,1,1,1,1,...,0,0,1,1,1,1,1,1,1,0
330,1,0,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,0
331,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,0


Building the GAN by stacking the generator and discriminator

In [4]:
from keras.optimizers import Adam 

class RanGAN():
    def __init__(self, data_path):
        
        self.data = pd.read_pickle(self.data_path)
        self.noise_dims = 100
        self.features_dims = 688
        self.n_epochs = 50
        self.n_batch = 32
        # choose the number based on the total number of features
        self.hide_layers = 128
        self.geneator_layers = [self.features_dims + self.noise_dims, ]
        
        
        optimizer = Adam(0.0001, 0.5)
        
        # build and compile the detector
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer , metrics=['accuracy'])
        
        # build the generator
        self.generator = self.build_generator()
        # generate the adversarial ransomware examples
        example = Input(shape = (self.features_dims))
        noise = Input(shape = (self.noise_dims))
        input_shape = [example, noise]
        adv_ran = self.generator(input_shape)
        
        # The discriminator takes generated images as input and determines validity
        validity = self.discriminator(adv_ran)
        
        # the combined model (stacked generator and discriminator)
        self.combined = Model(input_shape, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer = optimizer)
        
        # for the full GAN, only train the generator
        discriminator.trainable = False
        
        # Building a simple Generator network
        def build_generator(self):
            example = Input(shape=(self.features_dims,))
            noise = Input(shape = (self.noise_dims))
            # create the concatenated dims for input example
            adv_example = Concatenate(axis=1)([example, noise])
            for dim in [256, 128]:
                x = Dense(dim)(adv_example)
            x = Activation(activation='sigmoid')(x)
            x = Maximum()([example, x])
            # multiple input api
            generator = Model(input =[example, noise], output = x, name = 'Generator')
            generator.summary()
            return generator
        
        
        
        # Building a simple Discriminator network
        def build_discriminator(self):
            input_example = Input(shape=(feature_dims,))
            for dim in [feature_dims, 1]:
                x = Dense(dim)(input_example)
            validity = Activation(activation='sigmoid')(x)
            
            discriminator = Model(input_example, validity, name='Discriminator')
            discriminator.summary()
            
            return discriminator

        
        
        def load_data(self):
            # including features values and labels
            x_ran, y_ran, x_ben, y_ben = self.data[self.data['label']==1].iloc[:,:-1], 
                                         self.data[self.data['label']==1].iloc[:,-1], 
                                         self.data[self.data['label']==0].iloc[:,:-1], 
                                         self.data[self.data['label']==0].iloc[:,-1]

            return x_ran, y_ran, x_ben, y_ben
            
        
        def train_gan(self, generator, discriminator):
            
            # calculate the number of batches per epoch
            batches_per_epoch = int(self.dataset.shape[0]/self.n_batch)
            # calculate the number of training iterations
            n_steps = batches_per_epoch * self.n_epochs
            
            # Load and Split the dataset
            (x_ran, y_ran), (x_ben, y_ben) = self.load_data()
        
            
            for step in range(n_steps):
                
                # -----------------------
                # Train Discriminator
                # -----------------------
                
                # 1.select the random ransomware examples with batch size
                idx = np.random.randint(0, x_train_ran.shape[0], n_batch)
                x_ran_batch = x_train_ran[idx]
                
                # sample noise as generator input
                noise = np.random.normal(0, 1, (n_batch, self.noise_dims))
                
                idx = np.random.randint(0, x_ran_batch.shape)
                
                # 2. generate the batch of new ransomware examples
                gen_examples = self.generator.predict([x_ran_batch, noise])
                
                y_ran_batch =  self.discriminator.predict(np.ones(gen_examples.shape)*(gen_examples > 0.5))
                
                # 3. Train the discriminator using both fake and real examples
                # it will update the D's wrights by labeling all real examples as 1
                # and the fake images as 0
                d_loss_real = discriminator.train_on_batch(gen_examples, y_ran_batch)
                d_loss_fake = discriminator.train_on_batch()
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                
                # ------------------------
                # Train Generator
                # ------------------------
                
                # 4. Generate another batch of fake examples
                
                
                # 5. Train the full GAN model using fake examples only
                # update only the G's weights by labeling all fake examples as 1
                fake = generator.predict()
                result = discriminator.predict(fake)

SyntaxError: invalid syntax (<ipython-input-4-5c36f14477e5>, line 2)

In [None]:
if __name__ == "__main__":
    acgan = RanGAN()
    acgan.train(epoch=500, batch_size=32)