In [25]:
import numpy as np
import os
import json

# Each entry pairs a directory of behavioral logs with the class label given
# to every log found in it: 0 for the benign directory, 1 for the malware one.
directories_with_labels = [("DA Logs Benign", 0), ("DA Logs Malware", 1)]

In [26]:
def get_API_class_method_type_from_log(log):
    """Extract the ordered API-call sequence recorded in one behavioral log.

    The log is a JSON document whose ``"api_calls"`` entry is a string;
    wrapping that string in square brackets lets it be parsed as a JSON
    array of call records.

    Args:
        log: Path to the JSON log file.

    Returns:
        A list of ``"<class>:<method>:<type>"`` strings, one per API call,
        in the order the calls appear in the log.
    """
    with open(log) as log_file:
        json_log = json.load(log_file)
    # The file is no longer needed once the outer document is loaded.
    api_calls = json.loads("[" + json_log["api_calls"] + "]")
    return [
        ":".join([api_call["class"], api_call["method"], api_call["type"]])
        for api_call in api_calls
    ]

In [27]:
# Parse every log in each labelled directory into an API-call sequence.
# data_corpus[i] is the sequence for one log and labels[i] is its class label.
data_corpus = []
labels = []
# (path, error) for every log that could not be parsed, kept so that dropped
# samples can be inspected instead of disappearing silently.
skipped_logs = []
for directory, label in directories_with_labels:
    logs = os.listdir(directory)
    for log_path in logs:
        file_path = os.path.join(directory, log_path)
        try:
            api_sequence = get_API_class_method_type_from_log(file_path)
        except (OSError, ValueError, KeyError, TypeError) as error:
            # Best effort: unreadable files, malformed JSON (JSONDecodeError
            # is a ValueError) and logs missing the expected fields are
            # skipped. Anything else, including KeyboardInterrupt, propagates.
            skipped_logs.append((file_path, error))
            continue
        # Append both together so the corpus and labels stay aligned.
        data_corpus.append(api_sequence)
        labels.append(label)

In [28]:
# Sanity check: show the API-call sequence parsed from the first loaded log.
print(data_corpus[0])

['android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content', 'android.app.ContextImpl:registerReceiver:binder', 'android.app.ContextImpl:registerReceiver:binder', 'android.os.SystemProperties:get:content', 'android.os.SystemProperties:get:content']


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples (and their labels) for testing; random_state is
# fixed so the same split is produced for the same input ordering.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    data_corpus, labels, test_size=0.2, random_state=11
)

In [30]:
import collections
from nltk import ngrams
import numpy as np


def read_file(file_path):
    """Return the complete contents of the file at ``file_path`` as bytes."""
    with open(file_path, mode="rb") as handle:
        return handle.read()


def text_to_Ngrams(text, n):
    """Return every N-gram of ``text`` as a list.

    Args:
        text: The sequence to slide the window over.
        n: Number of consecutive items in each N-gram.

    Returns:
        The items yielded by ``nltk.ngrams(text, n)``, materialized as a list.
    """
    return list(ngrams(text, n))


def get_Ngram_counts(text, N):
    """Count how many times each N-gram occurs in ``text``.

    Args:
        text: The sequence to extract N-grams from.
        N: Number of consecutive items in each N-gram.

    Returns:
        A ``collections.Counter`` mapping each N-gram to its frequency.
    """
    return collections.Counter(text_to_Ngrams(text, N))

In [31]:
# Number of consecutive API calls that make up one N-gram feature.
N = 4
# Accumulate N-gram frequencies over the training split only, so the test
# split plays no part in choosing the features.
total_Ngram_count = collections.Counter()
for file in corpus_train:
    # update() adds the sample's counts in place. `+=` gives the same totals
    # here (all counts are positive) but re-filters the whole accumulated
    # counter after every sample, which gets slower as the counter grows.
    total_Ngram_count.update(get_Ngram_counts(file, N))

In [32]:
# Keep the K1 N-grams that occur most often in the training corpus.
K1 = 3000
# most_common() returns (N-gram, count) pairs, highest count first.
K1_most_frequent_Ngrams = total_Ngram_count.most_common(K1)
# Drop the counts; the N-grams in this order define the feature columns.
K1_most_frequent_Ngrams_list = [
    ngram for ngram, _count in K1_most_frequent_Ngrams
]

In [33]:
# Display the selected N-grams, most frequent first.
# NOTE(review): this renders the entire list (up to K1 entries); consider
# slicing it (e.g. [:5]) to keep the notebook output short.
K1_most_frequent_Ngrams_list

[('java.lang.reflect.Method:invoke:reflection',
  'java.lang.reflect.Method:invoke:reflection',
  'java.lang.reflect.Method:invoke:reflection',
  'java.lang.reflect.Method:invoke:reflection'),
 ('java.io.FileInputStream:read:runtime',
  'java.io.FileInputStream:read:runtime',
  'java.io.FileInputStream:read:runtime',
  'java.io.FileInputStream:read:runtime'),
 ('android.content.ContentValues:put:globals',
  'android.content.ContentValues:put:globals',
  'android.content.ContentValues:put:globals',
  'android.content.ContentValues:put:globals'),
 ('libcore.io.IoBridge:open:file',
  'libcore.io.IoBridge:open:file',
  'libcore.io.IoBridge:open:file',
  'libcore.io.IoBridge:open:file'),
 ('dalvik.system.DexFile:loadClass:dex',
  'dalvik.system.DexFile:loadClass:dex',
  'dalvik.system.DexFile:loadClass:dex',
  'dalvik.system.DexFile:loadClass:dex'),
 ('android.util.Base64:decode:generic',
  'android.util.Base64:decode:generic',
  'android.util.Base64:decode:generic',
  'android.util.Base64:

In [34]:
def featurize_sample(file, Ngrams_list, n=None):
    """Turn one sample into a vector of N-gram counts.

    Args:
        file: The sample's sequence, as accepted by ``get_Ngram_counts``.
        Ngrams_list: The selected N-grams; feature ``i`` is the number of
            times ``Ngrams_list[i]`` occurs in the sample.
        n: N-gram length used when counting the sample's N-grams. Defaults
            to the module-level ``N`` so existing two-argument calls behave
            as before. It should match the length of the N-grams in
            ``Ngrams_list``, otherwise every feature is 0.

    Returns:
        A list of ``len(Ngrams_list)`` integer counts, in the same order as
        ``Ngrams_list``.
    """
    if n is None:
        # Fall back to the notebook-wide setting, resolved at call time.
        n = N
    sample_counts = get_Ngram_counts(file, n)
    # A Counter returns 0 for N-grams the sample does not contain.
    return [sample_counts[ngram] for ngram in Ngrams_list]

In [35]:
# Build the feature matrices: one row per sample, one column per selected
# N-gram. Both splits use the N-gram list derived from the training corpus.
X_train = np.asarray(
    [
        featurize_sample(sample, K1_most_frequent_Ngrams_list)
        for sample in corpus_train
    ]
)
X_test = np.asarray(
    [
        featurize_sample(sample, K1_most_frequent_Ngrams_list)
        for sample in corpus_test
    ]
)

In [36]:
# Sanity check: each matrix should be (number of samples, number of selected N-grams).
print(X_train.shape)
print(X_test.shape)

(6652, 3000)
(1663, 3000)


In [37]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from functools import partial

# Number of features kept by the mutual-information selection step.
K2 = 500
# mutual_info_classif uses random numbers internally (see its `random_state`
# parameter), so it is seeded here to make the selected features, and hence
# the reported accuracies, reproducible across runs. The seed matches the one
# used for the train/test split.
seeded_mutual_info_classif = partial(mutual_info_classif, random_state=11)
# Two stages: keep the K2 highest-scoring N-gram features, then classify
# with gradient-boosted trees.
mi_pipeline = Pipeline(
    [
        ("mutual_information", SelectKBest(seeded_mutual_info_classif, k=K2)),
        ("xgb", XGBClassifier()),
    ]
)

In [38]:
# Fit feature selection and the classifier on the training split only, then
# report the pipeline's score on each split.
mi_pipeline.fit(X_train, y_train)
for heading, features, targets in (
    ("Training accuracy:", X_train, y_train),
    ("Testing accuracy:", X_test, y_test),
):
    print(heading)
    print(mi_pipeline.score(features, targets))

Training accuracy:
0.8131389055923031
Testing accuracy:
0.8033674082982561
