In [1]:
import numpy as np
import os
import json 
directoriesWithLabels = [("DA Logs Benign",0), ("DA Logs Malicious",1)]

In [14]:
def getAPIClassMethodTypeFromLog(log):
    APIdataSequence = []
    with open(log) as logFile:
        jsonLog = json.load(logFile)
        api_calls_array = "[" + jsonLog['api_calls'] + "]"
        api_calls = json.loads(api_calls_array)
        for api_call in api_calls:
            data = api_call['class']+":"+api_call['method']+":"+api_call['type']
            APIdataSequence.append(data)
    return APIdataSequence

In [15]:
data_corpus = []
labels = []
for directory, label in directoriesWithLabels:
    logs = os.listdir(directory)
    for logPath in logs:
        filePath = directory+"/"+logPath
        data_corpus.append(getAPIClassMethodTypeFromLog(filePath))
        labels.append(label)

In [17]:
print(data_corpus[0])

['android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.app.ContextImpl:registerReceiver:binder', 'android.app.ContextImpl:registerReceiver:binder', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content']


In [19]:
from sklearn.model_selection import train_test_split
corpus_train, corpus_test, y_train, y_test = train_test_split(data_corpus, labels, test_size=0.2, random_state=11)

In [22]:
import collections
from nltk import ngrams
import numpy as np

def textToNgrams(text, n):
    Ngrams = ngrams(text, n)
    return list(Ngrams)
    
def getNgramCounts(text, N):
    Ngrams = textToNgrams(text, N)
    return collections.Counter(Ngrams)

def getNGramFeaturesFromSample(file, K1_most_frequent_Ngrams_list):
    K1 = len(K1_most_frequent_Ngrams_list)
    feature_vector = K1*[0]
    fileNgrams = getNgramCounts(file, N)
    for i in range(K1):
        feature_vector[i]=fileNgrams[K1_most_frequent_Ngrams_list[i]]
    return feature_vector

In [24]:
N=4
totalNgramCount = collections.Counter([])
for file in corpus_train:
    totalNgramCount += getNgramCounts(file, N)

In [36]:
K1 = 3000
K1_most_frequent_Ngrams = totalNgramCount.most_common(K1)
K1_most_frequent_Ngrams_list = [x[0] for x in K1_most_frequent_Ngrams]

In [28]:
K1_most_frequent_Ngrams_list

[('java.lang.reflect.Method:invoke:reflection',
  'java.lang.reflect.Method:invoke:reflection',
  'java.lang.reflect.Method:invoke:reflection',
  'java.lang.reflect.Method:invoke:reflection'),
 ('java.io.FileInputStream:read:runtime',
  'java.io.FileInputStream:read:runtime',
  'java.io.FileInputStream:read:runtime',
  'java.io.FileInputStream:read:runtime'),
 ('android.content.ContentValues:put:globals',
  'android.content.ContentValues:put:globals',
  'android.content.ContentValues:put:globals',
  'android.content.ContentValues:put:globals'),
 ('libcore.io.IoBridge:open:file',
  'libcore.io.IoBridge:open:file',
  'libcore.io.IoBridge:open:file',
  'libcore.io.IoBridge:open:file'),
 ('dalvik.system.DexFile:loadClass:dex',
  'dalvik.system.DexFile:loadClass:dex',
  'dalvik.system.DexFile:loadClass:dex',
  'dalvik.system.DexFile:loadClass:dex'),
 ('android.util.Base64:decode:generic',
  'android.util.Base64:decode:generic',
  'android.util.Base64:decode:generic',
  'android.util.Base64:

In [37]:
def featurizeSample(file, Ngrams_list):
    K1 = len(Ngrams_list)
    feature_vector = K1*[0]
    fileNgrams = getNgramCounts(file, N)
    for i in range(K1):
        feature_vector[i]=fileNgrams[Ngrams_list[i]]
    return feature_vector

In [38]:
X_train = []
for sample in corpus_train:
    X_train.append(featurizeSample(sample, K1_most_frequent_Ngrams_list))
X_train = np.asarray(X_train)
X_test = []
for sample in corpus_test:
    X_test.append(featurizeSample(sample, K1_most_frequent_Ngrams_list))
X_test = np.asarray(X_test)

In [39]:
print(X_train.shape)
print(X_test.shape)

(6652, 3000)
(1663, 3000)


In [40]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
K2 = 500
mi_pipeline = Pipeline([('mutual_information', SelectKBest(mutual_info_classif, k=K2)),('xgb',XGBClassifier()),])

In [41]:
mi_pipeline.fit(X_train,y_train)
print("Training accuracy:")
print(mi_pipeline.score(X_train, y_train))
print("Testing accuracy:")
print(mi_pipeline.score(X_test, y_test))

Training accuracy:
0.8149428743235118
Testing accuracy:
0.8033674082982561
