In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import mysql.connector
import json
import pickle
import os
from collections import namedtuple
from gensim.models import doc2vec, word2vec
from gensim.models.doc2vec import Doc2Vec
from sklearn.model_selection import KFold
from keras import backend as K
from keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support
from math import floor, ceil
import time
import tensorflow as tf
from tensorflow.keras import layers
import networkx as nx
from PreprocessingUtilies import getTaggedDocumentRepresentation, splitFunctionNameListIntoWords, encodePurpose, decodePurpose
from graph2vec.graph2vecModule import Graph2VecModel
# Report how many GPU devices TensorFlow can see before any model is built.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Load settings from config.json in the working directory; getMySQLConnector()
# reads the mysql_* keys from the resulting dict.
with open('config.json') as config_file:
    config = json.load(config_file)

def getMySQLConnector():
    """Open and return a new MySQL connection built from the module-level ``config``.

    Reads the ``mysql_host``, ``mysql_user``, ``mysql_password`` and
    ``mysql_database`` keys. A fresh connection is created on every call;
    the caller is responsible for closing it.
    """
    connectionSettings = {
        'host': config["mysql_host"],
        'user': config["mysql_user"],
        'passwd': config["mysql_password"],
        'database': config["mysql_database"],
        'auth_plugin': 'mysql_native_password',
        'charset': 'utf8',
        'use_unicode': True,
    }
    return mysql.connector.connect(**connectionSettings)
    

In [None]:
def getAllEntries():
    """Fetch the feature columns of every labeled row in ``wat_details``.

    Rows whose ``Purpose`` is NULL or equal to ``'Unit Test'`` are excluded.

    Returns:
        list[dict]: one dict per row, keyed by column name, in the order the
        database returned the rows.
    """
    # Single source of truth for both the SELECT list and the dict keys, so
    # the two cannot drift out of sync.
    features = ['FileHash',
                'ImportFunctions',
                'ExportFunctions',
                'NumberOfFunctions',
                'WasmFileSize',
                'WatFileSize',
                'ExpansionFactor',
                'IsAsm',
                'TotalLinesOfCode',
                'MinFunctionLinesOfCode',
                'MaxFunctionLinesOfCode',
                'AvgFunctionLinesOfCode',
                'NumberOfTypes',
                'NumberOfImports',
                'NumberOfExports',
                'NumberOfDataSections',
                'NumberOfTableEntries']
    selectQuery = '''SELECT {columns}
            FROM wat_details
            WHERE Purpose IS NOT NULL
            AND Purpose <> 'Unit Test' '''.format(columns=', '.join(features))

    db = getMySQLConnector()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(selectQuery)
            dbResults = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db.close()

    return [dict(zip(features, row)) for row in dbResults]

def getGraphs():
    """Load every stored control-flow graph from ``wat_details``.

    Each non-NULL ``CFGEdgeList`` value is parsed as JSON and must contain
    ``'nodes'`` and ``'edges'`` entries, which are loaded into an undirected
    ``networkx.Graph``.

    Returns:
        list[networkx.Graph]: one graph per row with a non-NULL ``CFGEdgeList``.
    """
    selectQuery = '''SELECT  
                CFGEdgeList
            FROM wat_details  WHERE CFGEdgeList IS NOT NULL'''

    db = getMySQLConnector()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(selectQuery)
            dbResults = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db.close()

    entries = []
    for row in dbResults:
        graphJSON = json.loads(row[0])
        graph = nx.Graph()
        graph.add_nodes_from(graphJSON['nodes'])
        graph.add_edges_from(graphJSON['edges'])
        entries.append(graph)
    return entries

def getLabeledEntries(balanced=True):
    """Fetch labeled rows (features plus ``Purpose``) in random order.

    Rows whose ``Purpose`` is NULL or ``'Unit Test'`` are excluded.

    Args:
        balanced: when True (the default, matching the previous behaviour),
            keep at most 100 randomly chosen rows per ``Purpose`` value.
            When False, return every labeled row.

    Returns:
        list[dict]: one dict per row, keyed by column name.
    """
    # Single source of truth for both the SELECT list and the dict keys.
    cols = ['FileHash',
            'ImportFunctions',
            'ExportFunctions',
            'NumberOfFunctions',
            'WasmFileSize',
            'WatFileSize',
            'ExpansionFactor',
            'IsAsm',
            'TotalLinesOfCode',
            'MinFunctionLinesOfCode',
            'MaxFunctionLinesOfCode',
            'AvgFunctionLinesOfCode',
            'NumberOfTypes',
            'NumberOfImports',
            'NumberOfExports',
            'NumberOfDataSections',
            'NumberOfTableEntries',
            'Purpose']

    if balanced:
        # Rank rows within each Purpose using session variables and keep the
        # first 100 of each.
        # NOTE(review): this depends on MySQL evaluating the user-variable
        # assignments in the subquery's ORDER BY order -- confirm on the
        # server version in use.
        queryTemplate = '''
        SELECT {columns}
        FROM (
            SELECT *,
                @current_rank := IF(
                    @current_purpose = Purpose, 
                    @current_rank + 1, 
                    1
                ) AS current_rank,
                @current_purpose := Purpose
            FROM wat_details
            WHERE Purpose IS NOT NULL 
            AND Purpose <> 'Unit Test'
            ORDER BY Purpose, RAND() 
        ) a 
        WHERE current_rank <= 100
        ORDER BY RAND();
    '''
    else:
        queryTemplate = '''SELECT {columns}
            FROM wat_details
            WHERE Purpose IS NOT NULL 
            AND Purpose <> 'Unit Test'
            ORDER BY RAND()'''
    query = queryTemplate.format(columns=', '.join(cols))

    db = getMySQLConnector()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(query)
            dbResults = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db.close()

    return [dict(zip(cols, row)) for row in dbResults]


In [None]:
#Make Doc2Vec Model

# Each fetched row contributes two training texts: its import list and its export list.
trainingEntries = getAllEntries()
importFunctions = [entry['ImportFunctions'] for entry in trainingEntries]
exportFunctions = [entry['ExportFunctions'] for entry in trainingEntries]

# All imports first, then all exports; empty strings become the placeholder 'Empty'.
allFunctions = ['Empty' if names == '' else names
                for names in importFunctions + exportFunctions]

# getTaggedDocumentRepresentation yields several tagged documents per input; flatten them.
taggedNames = [tagged
               for names in allFunctions
               for tagged in getTaggedDocumentRepresentation(names)]

# Train the embedding model and persist it next to the notebook.
functionNameDoc2Vec = Doc2Vec(taggedNames, vector_size=10, window=2, workers=4, verbose=0)
with open('functionNameDoc2Vec.pkl', 'wb') as f:
    pickle.dump(functionNameDoc2Vec, f)
    

# Graph Classification (Commented out due to performance decrease)

# graphList = getGraphs()
# graphModel = Graph2VecModel(graphList)
# with open('graph2vec.pkl', 'wb') as f:
#     pickle.dump(graphModel, f)

def getFunctionNameEmbedding(functionNames):
    """Return the Doc2Vec vector inferred for a function-name list.

    ``functionNames`` is split by ``splitFunctionNameListIntoWords`` into
    per-name word lists; those are concatenated into one token sequence and
    passed to ``functionNameDoc2Vec.infer_vector``.
    """
    tokens = []
    for nameWords in splitFunctionNameListIntoWords(functionNames):
        tokens.extend(nameWords)
    return functionNameDoc2Vec.infer_vector(tokens)

def preprocessFeatures(features):
    """Turn one database row (a dict) into a flat feature list.

    The result holds the 14 scalar columns listed below, followed by the
    import-name embedding and then the export-name embedding. The graph
    embedding is intentionally not included (it is disabled in this notebook).

    Args:
        features: dict keyed by column name, as built by the fetch helpers.

    Returns:
        list: scalar column values followed by the entries of both embeddings.
    """
    # Embeddings are computed first, before any scalar column is looked up.
    importFunctionsVector = getFunctionNameEmbedding(features['ImportFunctions'])
    exportFunctionsVector = getFunctionNameEmbedding(features['ExportFunctions'])

    scalarColumns = (
        'NumberOfFunctions', 'WasmFileSize', 'WatFileSize', 'ExpansionFactor',
        'IsAsm', 'TotalLinesOfCode', 'MinFunctionLinesOfCode',
        'MaxFunctionLinesOfCode', 'AvgFunctionLinesOfCode', 'NumberOfTypes',
        'NumberOfImports', 'NumberOfExports', 'NumberOfDataSections',
        'NumberOfTableEntries',
    )

    featureRow = [features[column] for column in scalarColumns]
    featureRow.extend(importFunctionsVector)
    featureRow.extend(exportFunctionsVector)
    return featureRow
#Make training data for models
# Pair every labeled row's feature list with its encoded Purpose.
labeledData = [
    (preprocessFeatures(entry), encodePurpose(entry['Purpose']))
    for entry in getLabeledEntries()
]

# Split the pairs into the feature matrix and the target array.
trainFeatures = np.array([featureRow for featureRow, _ in labeledData])
trainTargets = np.array([target for _, target in labeledData])

# Number of feature columns; used as the input width of the neural networks.
INPUT_SHAPE = trainFeatures.shape[1]
print(trainFeatures.shape,trainTargets.shape)

# Model Saving

In [None]:
# Fit the final one-vs-rest Bernoulli naive Bayes model on all labeled data and pickle it.
naiveBayesClassifier = OneVsRestClassifier(
    BernoulliNB(alpha=2),
    n_jobs=-1,
).fit(trainFeatures, trainTargets)

with open('nBClassifier.pkl', 'wb') as f:
    pickle.dump(naiveBayesClassifier, f)
print("Naive Bayes model saved to nBClassifier.pkl")

In [None]:
# Fit the final random forest (fixed seed) on all labeled data and pickle it.
randomForestClassifier = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(trainFeatures, trainTargets)

with open('rfClassifier.pkl', 'wb') as f:
    pickle.dump(randomForestClassifier, f)
print('Random forest model saved to rfClassifier.pkl')

In [None]:
# Fit the final one-vs-rest linear SVM (with probability estimates enabled)
# on all labeled data and pickle it.
svmClassifier = OneVsRestClassifier(
    SVC(kernel='linear', C=1, gamma='auto', probability=True),
    n_jobs=-1,
).fit(trainFeatures, trainTargets)

with open('svmClassifier.pkl', 'wb') as f:
    pickle.dump(svmClassifier, f)
print('SVM model saved to svmClassifier.pkl')

In [None]:
# Final neural network: nine 1000-unit tanh layers followed by an 11-way softmax,
# trained on all labeled data and saved in HDF5 format.
neuralModel = tf.keras.Sequential()
neuralModel.add(layers.Dense(1000,activation='tanh', input_dim=INPUT_SHAPE))
for _ in range(8):
    neuralModel.add(layers.Dense(1000, activation='tanh'))
neuralModel.add(layers.Dense(11, activation='softmax'))
# The targets are per-row vectors: the evaluation cells pass each row of
# trainTargets to decodePurpose and compile the cross-validation network with
# 'categorical_crossentropy' on the same array. The sparse variant expects
# integer class ids instead, so use the same loss as the evaluation network.
# NOTE(review): assumes encodePurpose returns an 11-element one-hot vector -- confirm.
neuralModel.compile(optimizer=tf.keras.optimizers.RMSprop(0.000001),
                loss='categorical_crossentropy',
                metrics=['accuracy'])
neuralModel.fit(trainFeatures,  trainTargets, epochs=1500, batch_size=20, verbose=1)
neuralModel.save("neuralNetwork.h5")
print('Neural network model saved to neuralNetwork.h5')

# Evaluation Data


In [None]:
# 10-fold cross-validation of Bernoulli naive Bayes over a grid of smoothing values.
kf = KFold(n_splits=10)
naiveAvgStats = {}          # alpha -> per-fold metric lists
globalNaiveBestAcc = 0      # best single-fold accuracy seen for any alpha
naiveTimes = []             # fit durations in seconds, one per (fold, alpha)
naivePredictTimes = []      # predict durations in seconds, one per (fold, alpha)

naiveAlphas = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 0.9, 1, 2, 5, 10]
for train_index, test_index in kf.split(trainFeatures):
    # The split does not depend on alpha, so slice once per fold.
    X_train, X_test = trainFeatures[train_index], trainFeatures[test_index]
    y_train, y_test = trainTargets[train_index], trainTargets[test_index]

    for alpha in naiveAlphas:
        if alpha not in naiveAvgStats:
            naiveAvgStats[alpha] = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}

        start = time.time()
        naiveBayesClassifier = OneVsRestClassifier(BernoulliNB(alpha=alpha), n_jobs=-1)
        naiveBayesClassifier.fit(X_train, y_train)
        end = time.time()
        naiveTimes.append(end - start)

        predictstart = time.time()
        naivePrediction = naiveBayesClassifier.predict(X_test)
        predictend = time.time()
        naivePredictTimes.append(predictend - predictstart)

        predictedLabels = [decodePurpose(row) for row in naivePrediction]
        targetLabels = [decodePurpose(row) for row in y_test]

        naiveAcc = accuracy_score(y_test, np.array(naivePrediction))
        # sklearn expects (y_true, y_pred). Passing them the other way round
        # swaps precision with recall and weights by predicted-class counts.
        naiveStats = precision_recall_fscore_support(targetLabels, predictedLabels, average='weighted')

        if naiveAcc > globalNaiveBestAcc:
            globalNaiveBestAcc = naiveAcc

        naiveAvgStats[alpha]['Accuracy'].append(naiveAcc)
        naiveAvgStats[alpha]['Precision'].append(naiveStats[0])
        naiveAvgStats[alpha]['Recall'].append(naiveStats[1])
        naiveAvgStats[alpha]['F1'].append(naiveStats[2])

# Report the per-alpha averages across folds.
for alpha in naiveAlphas:
    print('Alpha', alpha)
    print('Average Precision: ', np.mean(np.array(naiveAvgStats[alpha]['Precision'])))
    print('Average Recall: ', np.mean(np.array(naiveAvgStats[alpha]['Recall'])))
    print('Average Accuracy: ', np.mean(np.array(naiveAvgStats[alpha]['Accuracy'])))
    print('Average F1: ', np.mean(np.array(naiveAvgStats[alpha]['F1'])))
    print('\n')
    


In [None]:
# 10-fold cross-validation of the random forest over a grid of ensemble sizes.
kf = KFold(n_splits=10)
rfAvgStats = {}         # n_estimators -> per-fold metric lists
rfTimes= []             # fit durations in seconds, one per (fold, size)
rfPredictTimes = []     # predict durations in seconds, one per (fold, size)
rfNumClassifiers = [10,20,50,80,100,150,200,500]
for train_index, test_index in kf.split(trainFeatures):
    # The split does not depend on the ensemble size, so slice once per fold.
    X_train, X_test = trainFeatures[train_index], trainFeatures[test_index]
    y_train, y_test = trainTargets[train_index], trainTargets[test_index]

    for num_classifiers in rfNumClassifiers:
        if num_classifiers not in rfAvgStats:
            rfAvgStats[num_classifiers] = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}

        start = time.time()
        randomForestClassifier = RandomForestClassifier(n_estimators = num_classifiers, n_jobs=-1)
        randomForestClassifier = randomForestClassifier.fit(X_train, y_train)
        end = time.time()
        rfTimes.append(end - start)

        predictstart = time.time()
        rfPrediction = randomForestClassifier.predict(X_test)
        predictend = time.time()
        rfPredictTimes.append(predictend - predictstart)

        predictedLabels = [decodePurpose(row) for row in rfPrediction]
        targetLabels = [decodePurpose(row) for row in y_test]

        rfAcc = accuracy_score(y_test,np.array(rfPrediction))
        # sklearn expects (y_true, y_pred). Passing them the other way round
        # swaps precision with recall and weights by predicted-class counts.
        rfStats = precision_recall_fscore_support(targetLabels, predictedLabels, average='weighted')

        rfAvgStats[num_classifiers]['Accuracy'].append(rfAcc)
        rfAvgStats[num_classifiers]['Precision'].append(rfStats[0])
        rfAvgStats[num_classifiers]['Recall'].append(rfStats[1])
        rfAvgStats[num_classifiers]['F1'].append(rfStats[2])

# Report the per-size averages across folds.
for num_classifiers in rfNumClassifiers:
    print('Num Estimators', num_classifiers)
    print('Average Precision: ', np.mean(np.array(rfAvgStats[num_classifiers]['Precision'])))
    print('Average Recall: ', np.mean(np.array(rfAvgStats[num_classifiers]['Recall'])))
    print('Average Accuracy: ', np.mean(np.array(rfAvgStats[num_classifiers]['Accuracy'])))
    print('Average F1: ', np.mean(np.array(rfAvgStats[num_classifiers]['F1'])))
    print('\n')


In [None]:
# 10-fold cross-validation of one-vs-rest SVMs over kernel type and C.
kf = KFold(n_splits=10)

svmAvgStats = {}        # kernel -> C -> per-fold metric lists
globalBestSVMAcc = 0    # best single-fold accuracy seen for any setting
svmTimes = []           # fit durations in seconds, one per (fold, kernel, C)
svmPredictTimes = []    # predict durations in seconds, one per (fold, kernel, C)

svmKernelTypes = ['rbf', 'linear']
svmCValues=[0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 0.9, 1, 2, 5, 10]
for train_index, test_index in kf.split(trainFeatures):
    # The split does not depend on the hyper-parameters, so slice once per fold.
    X_train, X_test = trainFeatures[train_index], trainFeatures[test_index]
    y_train, y_test = trainTargets[train_index], trainTargets[test_index]

    for kernel_type in svmKernelTypes:
        print('Kernel Type: ', kernel_type)
        if kernel_type not in svmAvgStats:
            svmAvgStats[kernel_type] = {}
        for c_val in svmCValues:
            if c_val not in svmAvgStats[kernel_type]:
                svmAvgStats[kernel_type][c_val] = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}

            start = time.time()
            svmClassifier = OneVsRestClassifier(SVC(kernel = kernel_type , C = c_val, gamma='scale'), n_jobs=-1)
            svmClassifier.fit(X_train, y_train)
            end = time.time()
            svmTimes.append(end - start)

            predictstart = time.time()
            svmPrediction = svmClassifier.predict(X_test)
            predictend = time.time()
            svmPredictTimes.append(predictend - predictstart)

            # Normalise every predicted entry to a plain int before decoding.
            svmPrediction = [[int(x) for x in singlePrediction] for singlePrediction in svmPrediction]

            predictedLabels = [decodePurpose(row) for row in svmPrediction]
            targetLabels = [decodePurpose(row) for row in y_test]

            svmAcc = accuracy_score(y_test, np.array(svmPrediction))
            # sklearn expects (y_true, y_pred). Passing them the other way
            # round swaps precision with recall and weights by predicted-class counts.
            svmStats = precision_recall_fscore_support(targetLabels, predictedLabels, average='weighted')
            print(svmAcc)
            print(svmStats)
            if svmAcc > globalBestSVMAcc:
                globalBestSVMAcc = svmAcc

            svmAvgStats[kernel_type][c_val]['Accuracy'].append(svmAcc)
            svmAvgStats[kernel_type][c_val]['Precision'].append(svmStats[0])
            svmAvgStats[kernel_type][c_val]['Recall'].append(svmStats[1])
            svmAvgStats[kernel_type][c_val]['F1'].append(svmStats[2])

# Report the per-setting averages across folds.
for kernel_type in svmKernelTypes:
    print('Kernel Type: ', kernel_type)
    for c_val in svmCValues:
        print('C', c_val)
        print('Average Precision: ', np.mean(np.array(svmAvgStats[kernel_type][c_val]['Precision'])))
        print('Average Recall: ', np.mean(np.array(svmAvgStats[kernel_type][c_val]['Recall'])))
        print('Average Accuracy: ', np.mean(np.array(svmAvgStats[kernel_type][c_val]['Accuracy'])))
        print('Average F1: ', np.mean(np.array(svmAvgStats[kernel_type][c_val]['F1'])))
        print('\n')
    print('\n')
    


In [None]:
def recall_m(y_true, y_pred):
    """Keras metric: rounded true positives divided by rounded actual positives.

    Both counts are taken after clipping to [0, 1] and rounding;
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actualPositives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actualPositives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Keras metric: rounded true positives divided by rounded predicted positives.

    Both counts are taken after clipping to [0, 1] and rounding;
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predictedPositives = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predictedPositives) + K.epsilon())

def f1_m(y_true, y_pred):
    """Keras metric: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

# 10-fold cross-validation of dense networks over activation function and depth.
kf = KFold(n_splits=10)
neuralTimes = []            # build + fit durations in seconds, one per model
nueralPredictTimes = []     # evaluate durations in seconds (name kept as-is: other cells may read it)
num_nodes = 1000            # width of every hidden layer
neuralAvgStats = {}         # activation -> depth -> per-fold metric lists
neuralLayers = [1,2,4,8]
neuralActivation = ['relu', 'tanh', 'sigmoid', 'selu']

for train_index, test_index in kf.split(trainFeatures):
    X_train, X_test =  trainFeatures[train_index], trainFeatures[test_index]
    y_train, y_test = trainTargets[train_index], trainTargets[test_index]

    for activation_function in neuralActivation:
        print('Activation Function ', activation_function)
        if activation_function not in neuralAvgStats:
            neuralAvgStats[activation_function] = {}
        for num_layer in neuralLayers:
            print('Num Layers', num_layer)
            if num_layer not in neuralAvgStats[activation_function]:
                neuralAvgStats[activation_function][num_layer] = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}
            start = time.time()
            model = tf.keras.Sequential()
            # One input-facing dense layer plus num_layer further dense layers,
            # each num_nodes wide, then an 11-way softmax.
            model.add(layers.Dense(num_nodes, activation=activation_function, input_shape=(INPUT_SHAPE,)))
            for _ in range(num_layer):
                model.add(layers.Dense(num_nodes, activation=activation_function))

            model.add(layers.Dense(11, activation='softmax'))
            model.compile(optimizer=tf.keras.optimizers.RMSprop(0.000001),
                            loss='categorical_crossentropy',
                            metrics=['accuracy',f1_m,precision_m, recall_m])

            model.fit(X_train,  y_train, epochs=1500, batch_size=20, verbose=0)
            end = time.time()
            neuralTimes.append(end - start)

            predictstart = time.time()
            # evaluate() returns the loss followed by the metrics in compile order.
            # The F1 value gets its own name so it does not overwrite the
            # f1_score function imported from sklearn.metrics.
            loss, accuracy, f1Value, precision, recall = model.evaluate(X_test, np.array(y_test), batch_size=20, verbose=0)
            predictend = time.time()
            nueralPredictTimes.append(predictend - predictstart)
            neuralAvgStats[activation_function][num_layer]['Accuracy'].append(accuracy)
            neuralAvgStats[activation_function][num_layer]['Precision'].append(precision)
            neuralAvgStats[activation_function][num_layer]['Recall'].append(recall)
            neuralAvgStats[activation_function][num_layer]['F1'].append(f1Value)
            print('Accuracy: ', accuracy)
            print('Precision: ', precision)
            print('Recall: ', recall)
            print('F1 Score: ', f1Value)
            print('\n')
    print('\n')
print('\n')

# Report the per-configuration averages across folds.
for activation_function in neuralActivation:
    print('Activation', activation_function)
    for num_layer in neuralLayers:
        print('Number of Layers: ', num_layer)
        print('Average Accuracy: ', np.mean(np.array(neuralAvgStats[activation_function][num_layer]['Accuracy'])))
        print('Average Precision: ', np.mean(np.array(neuralAvgStats[activation_function][num_layer]['Precision'])))
        print('Average Recall: ', np.mean(np.array(neuralAvgStats[activation_function][num_layer]['Recall'])))
        print('Average F1: ', np.mean(np.array(neuralAvgStats[activation_function][num_layer]['F1'])))
        print('\n')
    print('\n')
    