In [2]:
import os
from os import listdir

directories_with_labels = [("Benign PE Samples", 0), ("Malicious PE Samples", 1)]
list_of_samples = []
labels = []
for dataset_path, label in directories_with_labels:
    samples = [f for f in listdir(dataset_path)]
    for sample in samples:
        file_path = os.path.join(dataset_path, sample)
        list_of_samples.append(file_path)
        labels.append(label)

In [3]:
from sklearn.model_selection import train_test_split

samples_train, samples_test, labels_train, labels_test = train_test_split(
    list_of_samples, labels, test_size=0.3, stratify=labels, random_state=11
)

In [4]:
import collections
from nltk import ngrams
import numpy as np
import pefile


def read_file(file_path):
    """Reads in the binary sequence of a binary file."""
    with open(file_path, "rb") as binary_file:
        data = binary_file.read()
    return data


def byte_sequence_to_Ngrams(byte_sequence, N):
    """Creates a list of N-grams from a byte sequence."""
    Ngrams = ngrams(byte_sequence, N)
    return list(Ngrams)


def binary_file_to_Ngram_counts(file, N):
    """Takes a binary file and outputs the N-grams counts of its binary sequence."""
    filebyte_sequence = read_file(file)
    file_Ngrams = byte_sequence_to_Ngrams(filebyte_sequence, N)
    return collections.Counter(file_Ngrams)


def get_NGram_features_from_sample(sample, K1_most_frequent_Ngrams_list):
    """Takes a sample and produces a feature vector.
    The features are the counts of the K1 N-grams we've selected.
    """
    K1 = len(K1_most_frequent_Ngrams_list)
    feature_vector = K1 * [0]
    file_Ngrams = binary_file_to_Ngram_counts(sample, N)
    for i in range(K1):
        feature_vector[i] = file_Ngrams[K1_most_frequent_Ngrams_list[i]]
    return feature_vector


def preprocess_imports(list_of_DLLs):
    """Normalize the naming of the imports of a PE file."""
    temp = [x.decode().split(".")[0].lower() for x in list_of_DLLs]
    return " ".join(temp)


def get_imports(pe):
    """Get a list of the imports of a PE file."""
    list_of_imports = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        list_of_imports.append(entry.dll)
    return preprocess_imports(list_of_imports)


def get_section_names(pe):
    """Gets a list of section names from a PE file."""
    list_of_section_names = []
    for sec in pe.sections:
        normalized_name = sec.Name.decode().replace("\x00", "").lower()
        list_of_section_names.append(normalized_name)
    return "".join(list_of_section_names)

In [5]:
N = 2
Ngram_counts_all = collections.Counter([])
for sample in samples_train:
    Ngram_counts_all += binary_file_to_Ngram_counts(sample, N)
K1 = 100
K1_most_frequent_Ngrams = Ngram_counts_all.most_common(K1)
K1_most_frequent_Ngrams_list = [x[0] for x in K1_most_frequent_Ngrams]

In [6]:
imports_corpus_train = []
num_sections_train = []
section_names_train = []
Ngram_features_list_train = []
y_train = []
for i in range(len(samples_train)):
    sample = samples_train[i]
    try:
        NGram_features = get_NGram_features_from_sample(
            sample, K1_most_frequent_Ngrams_list
        )
        pe = pefile.PE(sample)
        imports = get_imports(pe)
        n_sections = len(pe.sections)
        sec_names = get_section_names(pe)
        imports_corpus_train.append(imports)
        num_sections_train.append(n_sections)
        section_names_train.append(sec_names)
        Ngram_features_list_train.append(NGram_features)
        y_train.append(labels_train[i])
    except Exception as e:
        print(sample + ":")
        print(e)

Benign PE Samples\iisrstas.exe:
'DOS Header magic not found.'
Benign PE Samples\InspectVhdDialog6.2.exe:
'DOS Header magic not found.'
Benign PE Samples\lpr.exe:
'DOS Header magic not found.'
Benign PE Samples\iissetup.exe:
'DOS Header magic not found.'
Benign PE Samples\dsmgmt.exe:
'DOS Header magic not found.'
Benign PE Samples\evntwin.exe:
'DOS Header magic not found.'
Benign PE Samples\CCG.exe:
'DOS Header magic not found.'
Benign PE Samples\hcsdiag.exe:
'DOS Header magic not found.'
Benign PE Samples\hvsirdpclient.exe:
'DOS Header magic not found.'
Benign PE Samples\lpq.exe:
'DOS Header magic not found.'
Benign PE Samples\hvc.exe:
'DOS Header magic not found.'
Benign PE Samples\hvsimgr.exe:
'DOS Header magic not found.'
Benign PE Samples\AppVStreamingUX.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Benign PE Samples\bash.exe:
'DOS Header magic not found.'
Benign PE Samples\dsamain.exe:
'DOS Header magic not found.'
Benign PE Samples\LogCollector.exe:
'DOS Header magic

In [7]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

imports_featurizer = Pipeline(
    [
        ("vect", HashingVectorizer(input="content", ngram_range=(1, 2))),
        ("tfidf", TfidfTransformer(use_idf=True,)),
    ]
)
section_names_featurizer = Pipeline(
    [
        ("vect", HashingVectorizer(input="content", ngram_range=(1, 2))),
        ("tfidf", TfidfTransformer(use_idf=True,)),
    ]
)
imports_corpus_train_transformed = imports_featurizer.fit_transform(
    imports_corpus_train
)
section_names_train_transformed = section_names_featurizer.fit_transform(
    section_names_train
)

In [8]:
from scipy.sparse import hstack, csr_matrix

X_train = hstack(
    [
        Ngram_features_list_train,
        imports_corpus_train_transformed,
        section_names_train_transformed,
        csr_matrix(num_sections_train).transpose(),
    ]
)

In [9]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train, y_train)

In [10]:
clf.score(X_train, y_train)

1.0

In [13]:
imports_corpus_test = []
num_sections_test = []
section_names_test = []
Ngram_features_list_test = []
y_test = []
for i in range(len(samples_test)):
    file = samples_test[i]
    try:
        NGram_features = get_NGram_features_from_sample(
            sample, K1_most_frequent_Ngrams_list
        )
        pe = pefile.PE(file)
        imports = get_imports(pe)
        n_sections = len(pe.sections)
        sec_names = get_section_names(pe)
        imports_corpus_test.append(imports)
        num_sections_test.append(n_sections)
        section_names_test.append(sec_names)
        Ngram_features_list_test.append(NGram_features)
        y_test.append(labels_test[i])
    except Exception as e:
        print(sample + ":")
        print(e)

Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'utf-8' codec can't decode byte 0xd2 in position 6: invalid continuation byte
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'
Benign PE Samples\findstr.exe:
'DOS Header magic not found.'


In [14]:
imports_corpus_test_transformed = imports_featurizer.transform(imports_corpus_test)
section_names_test_transformed = section_names_featurizer.transform(section_names_test)
X_test = hstack(
    [
        Ngram_features_list_test,
        imports_corpus_test_transformed,
        section_names_test_transformed,
        csr_matrix(num_sections_test).transpose(),
    ]
)

In [15]:
clf.score(X_test, y_test)

0.8859649122807017