In [3]:
from os import listdir
from nltk import ngrams
import hashlib

directories = ["Benign PE Samples", "Malicious PE Samples"]
N = 2

In [4]:
def read_file(file_path):
    """Reads in the binary sequence of a binary file."""
    with open(file_path, "rb") as binary_file:
        data = binary_file.read()
    return data


def byte_sequence_to_Ngrams(byte_sequence, N):
    """Creates a list of N-grams from a byte sequence."""
    return ngrams(byte_sequence, N)


def hash_input(inp):
    """Compute the MD5 hash of an input."""
    return int(hashlib.md5(inp).hexdigest(), 16)


def make_ngram_hashable(Ngram):
    """Convert N-gram into bytes to be hashable."""
    return bytes(Ngram)


def hash_file_Ngrams_into_dictionary(file_Ngrams, T):
    """Hashes N-grams in a list and then keeps track of the counts in a dictionary."""
    for Ngram in file_Ngrams:
        hashable_Ngram = make_ngram_hashable(Ngram)
        hashed_and_reduced = hash_input(hashable_Ngram) % B
        T[hashed_and_reduced] = T.get(hashed_and_reduced, 0) + 1

In [5]:
B = 65521
T = {}
for dataset_path in directories:
    samples = [f for f in listdir(dataset_path)]
    for file in samples:
        file_path = dataset_path + "/" + file
        file_byte_sequence = read_file(file_path)
        file_Ngrams = byte_sequence_to_Ngrams(file_byte_sequence, N)
        hash_file_Ngrams_into_dictionary(file_Ngrams, T)
K1 = 1000
import heapq

K1_most_common_Ngrams_Using_Hash_Grams = heapq.nlargest(K1, T)

In [6]:
def featurize_sample(file, K1_most_common_Ngrams_Using_Hash_Grams):
    """Takes a sample and produces a feature vector.
    The features are the counts of the K1 N-grams we've selected.
    """
    K1 = len(K1_most_common_Ngrams_Using_Hash_Grams)
    fv = K1 * [0]
    file_byte_sequence = read_file(file_path)
    file_Ngrams = byte_sequence_to_Ngrams(file_byte_sequence, N)
    for Ngram in file_Ngrams:
        hashable_Ngram = make_ngram_hashable(Ngram)
        hashed_and_reduced = hash_input(hashable_Ngram) % B
        if hashed_and_reduced in K1_most_common_Ngrams_Using_Hash_Grams:
            index = K1_most_common_Ngrams_Using_Hash_Grams.index(hashed_and_reduced)
            fv[index] += 1
    return fv

In [7]:
X = []
for dataset_path in directories:
    samples = [f for f in listdir(dataset_path)]
    for file in samples:
        file_path = dataset_path + "/" + file
        X.append(featurize_sample(file_path, K1_most_common_Ngrams_Using_Hash_Grams))

In [8]:
X[0]

[3,
 1,
 3,
 6,
 3,
 2,
 7,
 4,
 2,
 5,
 0,
 6,
 1,
 5,
 8,
 1,
 1,
 0,
 5,
 3,
 3,
 10,
 3,
 4,
 0,
 4,
 2,
 3,
 0,
 2,
 4,
 5,
 6,
 7,
 4,
 3,
 3,
 11,
 2,
 1,
 4,
 1,
 0,
 4,
 4,
 3,
 3,
 5,
 5,
 3,
 6,
 1,
 4,
 0,
 3,
 2,
 5,
 3,
 0,
 2,
 2,
 5,
 4,
 2,
 2,
 9,
 8,
 4,
 3,
 1,
 4,
 2,
 9,
 3,
 1,
 5,
 3,
 3,
 3,
 3,
 13,
 3,
 1,
 1,
 1,
 4,
 2,
 6,
 6,
 4,
 5,
 7,
 1,
 2,
 0,
 2,
 3,
 0,
 2,
 5,
 2,
 6,
 1,
 4,
 1,
 4,
 2,
 8,
 3,
 0,
 1,
 5,
 1,
 2,
 4,
 8,
 2,
 0,
 6,
 8,
 2,
 6,
 4,
 0,
 5,
 6,
 1,
 6,
 2,
 4,
 1,
 4,
 1,
 3,
 7,
 9,
 4,
 3,
 8,
 0,
 5,
 4,
 2,
 2,
 2,
 0,
 8,
 2,
 0,
 3,
 0,
 1,
 3,
 2,
 1,
 1,
 1,
 6,
 6,
 1,
 4,
 13,
 4,
 5,
 5,
 3,
 3,
 2,
 1,
 7,
 7,
 2,
 1,
 5,
 0,
 3,
 1,
 1,
 2,
 3,
 3,
 2,
 2,
 2,
 3,
 11,
 4,
 2,
 1,
 4,
 4,
 3,
 5,
 5,
 1,
 5,
 6,
 1,
 4,
 5,
 1,
 4,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 6,
 1,
 9,
 3,
 1,
 2,
 2,
 2,
 0,
 1,
 1,
 1,
 1,
 3,
 3,
 0,
 8,
 3,
 0,
 4,
 0,
 6,
 3,
 4,
 1,
 2,
 3,
 3,
 4,
 4,
 1,
 3,
 3,
 1,
 7,
 4,
 1,
 3,
 1,