In [2]:
from keras.layers import Input, Dense, Activation
from keras.layers import Maximum, Concatenate
from keras.models import Model
from keras.optimizers import adam_v2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from Ensemble_Classifiers import Ensemble_Classifier
from sklearn.model_selection import train_test_split
import numpy as np
global seed

seed = 0

In [3]:
class MalGAN():
    def __init__(self, blackbox, X, Y, threshold):
        self.apifeature_dims = 69
        self.z_dims = 30
        self.generator_layers = [self.apifeature_dims+self.z_dims, 32, 32, 64 , self.apifeature_dims]
        # self.generator_layers = [self.apifeature_dims+self.z_dims, 64, 64, 128 , self.apifeature_dims]

        self.substitute_detector_layers = [self.apifeature_dims, 64, 64, 1]
        # self.substitute_detector_layers = [self.apifeature_dims, 128, 128, 1]
        self.blackbox = blackbox       
        optimizer = adam_v2.Adam(learning_rate=0.0002, beta_1=0.5)
        self.X = X
        self.Y = Y
        self.threshold = threshold

        # Build and Train blackbox_detector
        self.blackbox_detector = self.build_blackbox_detector()

        # Build and compile the substitute_detector
        self.substitute_detector = self.build_substitute_detector()
        self.substitute_detector.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes malware and noise as input and generates adversarial malware examples
        example = Input(shape=(self.apifeature_dims,))
        noise = Input(shape=(self.z_dims,))
        input = [example, noise]
        malware_examples = self.generator(input)

        # The discriminator takes generated images as input and determines validity
        validity = self.substitute_detector(malware_examples)

        # The combined model  (stacked generator and substitute_detector)
        # Trains the generator to fool the discriminator
        self.combined = Model(input, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)
        
        # For the combined model we will only train the generator
        self.substitute_detector.trainable = False
    

    def build_blackbox_detector(self):
        if self.blackbox in ['SVM']:
            blackbox_detector = SVC(kernel = 'linear')
        
        elif self.blackbox in ['GB']:
            blackbox_detector = GradientBoostingClassifier(random_state=seed)
        
        elif self.blackbox in ['SGD']:
            blackbox_detector = SGDClassifier(random_state=seed)  

        elif self.blackbox in ['DT']:
            blackbox_detector = DecisionTreeClassifier(random_state=seed)
        
        elif self.blackbox in ['Ensem']:
            blackbox_detector = Ensemble_Classifier()

        return blackbox_detector

    def build_generator(self):

        example = Input(shape=(self.apifeature_dims,))
        noise = Input(shape=(self.z_dims,))
        x = Concatenate(axis=1)([example, noise])
        for dim in self.generator_layers[1:]:
            x = Dense(dim)(x)
        x = Activation(activation='tanh')(x)
        x = Maximum()([example, x])
        generator = Model([example, noise], x, name='generator')
        generator.summary()
        return generator

    def build_substitute_detector(self):

        input = Input(shape=(self.substitute_detector_layers[0],))
        x = input
        for dim in self.substitute_detector_layers[1:]:
            x = Dense(dim)(x)
        x = Activation(activation='sigmoid')(x)
        substitute_detector = Model(input, x, name='substitute_detector')
        substitute_detector.summary()
        return substitute_detector

    def load_data(self):
        x_ben, x_ran,y_ben, y_ran = self.X[:self.threshold], self.X[self.threshold:], self.Y[:self.threshold], self.Y[self.threshold:]

        return (x_ran, y_ran), (x_ben, y_ben)
    
    
    def train(self, epochs, batch_size=32):

        # Load and Split the dataset
        (xmal, ymal), (xben, yben) = self.load_data()
        xtrain_mal, xtest_mal, ytrain_mal, ytest_mal = train_test_split(xmal, ymal, test_size=0.50)
        xtrain_ben, xtest_ben, ytrain_ben, ytest_ben = train_test_split(xben, yben, test_size=0.50)

        bl_xtrain_mal, bl_ytrain_mal, bl_xtrain_ben, bl_ytrain_ben = xtrain_mal, ytrain_mal, xtrain_ben, ytrain_ben

        
        self.blackbox_detector.fit(np.concatenate([xmal, xben]), np.concatenate([ymal, yben]))

        ytrain_ben_blackbox = self.blackbox_detector.predict(bl_xtrain_ben)
        
        Original_Train_TPR = self.blackbox_detector.score(bl_xtrain_mal, bl_ytrain_mal)
        
        Original_Test_TPR = self.blackbox_detector.score(xtest_mal, ytest_mal)
        Train_TPR, Test_TPR = [Original_Train_TPR], [Original_Test_TPR]


        for epoch in range(epochs):

            for step in range(xtrain_mal.shape[0] // batch_size):
                # ---------------------
                #  Train substitute_detector
                # ---------------------

                # Select a random batch of malware examples
                idx_mal = np.random.randint(0, xtrain_mal.shape[0], batch_size)
  
                xmal_batch = xtrain_mal[idx_mal]
                
                noise = np.random.normal(0, 1, (batch_size, self.z_dims))
                
                idx_ben = np.random.randint(0, xmal_batch.shape[0], batch_size)
                
                xben_batch = xtrain_ben[idx_ben]
                yben_batch = ytrain_ben_blackbox[idx_ben]

                # Generate a batch of new malware examples
                gen_examples = self.generator.predict([xmal_batch, noise])
                ymal_batch = self.blackbox_detector.predict(np.ones(gen_examples.shape)*(gen_examples > 0.5))

                # Train the substitute_detector

                d_loss_real = self.substitute_detector.train_on_batch(gen_examples, ymal_batch)
                d_loss_fake = self.substitute_detector.train_on_batch(xben_batch, yben_batch)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

                # ---------------------
                #  Train Generator
                # ---------------------

                idx = np.random.randint(0, xtrain_mal.shape[0], batch_size)
                xmal_batch = xtrain_mal[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))

                # Train the generator
                g_loss = self.combined.train_on_batch([xmal_batch, noise], np.zeros((batch_size, 1)))

            # Compute Train TPR
            noise = np.random.uniform(0, 1, (xtrain_mal.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtrain_mal, noise])
            TPR = self.blackbox_detector.score(np.ones(gen_examples.shape) * (gen_examples > 0.5), ytrain_mal)
            Train_TPR.append(TPR)

            # Compute Test TPR
            noise = np.random.uniform(0, 1, (xtest_mal.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtest_mal, noise])
            TPR = self.blackbox_detector.score(np.ones(gen_examples.shape) * (gen_examples > 0.5), ytest_mal)
            Test_TPR.append(TPR)

            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))
            
            if int(epoch) == int(epochs-1):
                return  d_loss[0], 100*d_loss[1], g_loss
        

In [4]:
# create the dict to save the D loss, acc and G loss for different classifiers
D_loss_dict, Acc_dict, G_loss_dict = {}, {}, {}
# get the data from Feature-Selector
import pandas as pd

df= pd.read_csv('../dataset/matrix/CLaMP.csv')



In [5]:
df.dtypes.value_counts()

int64      66
float64     3
object      1
dtype: int64

In [6]:
df.columns

Index(['e_cblp', 'e_cp', 'e_cparhdr', 'e_maxalloc', 'e_sp', 'e_lfanew',
       'NumberOfSections', 'CreationYear', 'FH_char0', 'FH_char1', 'FH_char2',
       'FH_char3', 'FH_char4', 'FH_char5', 'FH_char6', 'FH_char7', 'FH_char8',
       'FH_char9', 'FH_char10', 'FH_char11', 'FH_char12', 'FH_char13',
       'FH_char14', 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'OH_DLLchar0', 'OH_DLLchar1',
       'OH_DLLchar2', 'OH_DLLchar3', 'OH_DLLchar4', 'OH_DLLchar5',
       'OH_DLLchar6', 'OH_DLLchar7', 'OH_DLLchar8', 'OH_DLLchar9',
       'OH_DLLchar10', 'SizeOfStackReserve', 'SizeO

In [7]:
# encode categorical column
from sklearn.preprocessing import LabelEncoder
df['packer_type'] = LabelEncoder().fit_transform(df['packer_type'])

In [8]:
df['packer_type'].value_counts()

18    4395
32     231
4      154
5      110
16      93
17      90
7       25
31      14
15      13
27       9
10       9
24       8
20       8
28       7
25       5
29       4
3        4
0        3
36       2
33       2
39       2
30       2
6        2
11       2
35       1
23       1
8        1
12       1
19       1
38       1
34       1
26       1
2        1
1        1
22       1
9        1
13       1
14       1
21       1
37       1
Name: packer_type, dtype: int64

In [9]:
Y = df['class'].values
X = df.drop('class', axis=1).values

In [10]:
X.shape

(5210, 69)

In [13]:
from sklearn.preprocessing import MinMaxScaler

X = MinMaxScaler().fit_transform(X)


array([[3.89105058e-03, 1.49625935e-04, 3.11259824e-04, ...,
        7.12092577e-03, 8.05612826e-01, 1.00000000e+00],
       [3.89105058e-03, 1.49625935e-04, 3.11259824e-04, ...,
        3.70775942e-05, 6.20165013e-01, 0.00000000e+00],
       [3.89105058e-03, 1.49625935e-04, 3.11259824e-04, ...,
        3.39974503e-04, 7.88645721e-01, 1.00000000e+00],
       ...,
       [3.89105058e-03, 1.49625935e-04, 3.11259824e-04, ...,
        1.36260159e-03, 9.74947062e-01, 0.00000000e+00],
       [3.89105058e-03, 1.49625935e-04, 3.11259824e-04, ...,
        1.62986925e-03, 9.83855633e-01, 0.00000000e+00],
       [3.89105058e-03, 1.49625935e-04, 3.11259824e-04, ...,
        7.78629479e-04, 9.07294642e-01, 1.00000000e+00]])

In [1]:
X

NameError: name 'X' is not defined

In [34]:
from collections import Counter
Counter(Y)

Counter({0: 2488, 1: 2722})

In [35]:
# load the classifier
for classifier in [ 'SVM', 'SGD', 'DT', 'GB', 'Ensem']: 
    print('[+] \nTraining the model with {} classifier\n'.format(classifier))
    malgan = MalGAN(blackbox=classifier, X=X, Y=Y, threshold = 2488)
    d_loss, acc, g_loss = malgan.train(epochs=50, batch_size=32)

    D_loss_dict[classifier] = d_loss
    Acc_dict[classifier] = acc 
    G_loss_dict[classifier] = g_loss


print('=====================')
print(D_loss_dict)
print('=====================')
print(Acc_dict)
print('=====================')
print(G_loss_dict)

[+] 
Training the model with SVM classifier

Model: "substitute_detector"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_26 (InputLayer)       [(None, 69)]              0         
                                                                 
 dense_35 (Dense)            (None, 64)                4480      
                                                                 
 dense_36 (Dense)            (None, 64)                4160      
                                                                 
 dense_37 (Dense)            (None, 1)                 65        
                                                                 
 activation_10 (Activation)  (None, 1)                 0         
                                                                 
Total params: 8,705
Trainable params: 8,705
Non-trainable params: 0
_________________________________________________________________


In [39]:

matrix_dict = {}

for key, value in D_loss_dict.items():
    matrix_dict[key] = []


for key, value in D_loss_dict.items():
    matrix_dict[key].append(D_loss_dict[key])
    matrix_dict[key].append(Acc_dict[key])
    matrix_dict[key].append(G_loss_dict[key])

In [40]:
import pandas as pd

df = pd.DataFrame.from_dict(matrix_dict, orient='columns') 
df.index= list([ 'D_Loss', 'Acc', 'G_Loss'])
df

Unnamed: 0,SVM,SGD,DT,GB,Ensem
D_Loss,0.2143743,0.1366699,0.182178,0.085044,0.319834
Acc,92.1875,95.3125,92.1875,98.4375,89.0625
G_Loss,3.931344e-08,2.905405e-07,0.000247,1.4e-05,0.001135


In [38]:
import dataframe_image as dfi
dfi.export(df, '64_mal_matrix.png')