In [1]:
import os
directoriesWithLabels = [("Benign PE Samples",0), ("Malicious PE Samples",1)]
listOfSamples = []
labels = []
for datasetPath, label in directoriesWithLabels:
    samples = [f for f in listdir(datasetPath)]
    for sample in samples:
        filePath = os.path.join(datasetPath, sample)
        listOfSamples.append(filePath)
        labels.append(label)

In [2]:
from sklearn.model_selection import train_test_split
samples_train, samples_test, labels_train, labels_test = train_test_split(listOfSamples, labels, test_size=0.3, stratify=labels, random_state=11)

In [3]:
import collections
from nltk import ngrams
import numpy as np
import pefile

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
    return data

def byteSequenceToNgrams(byteSequence, N):
    Ngrams = ngrams(byteSequence, N)
    return list(Ngrams)

def binaryFileToNgramCounts(sample, N):
    fileByteSequence = readFile(sample)
    fileNgrams = byteSequenceToNgrams(fileByteSequence, N)
    return collections.Counter(fileNgrams)

def getNGramFeaturesFromSample(file, K1_most_frequent_Ngrams_list):
    K1 = len(K1_most_frequent_Ngrams_list)
    fv = K1*[0]
    fileNgrams = binaryFileToNgramCounts(sample, N)
    for i in range(K1):
        fv[i]=fileNgrams[K1_most_frequent_Ngrams_list[i]]
    return fv

def preprocessImports(listOfDLLs):
    processedListOfDLLs = []
    tempDllList = [x.decode().split(".")[0].lower() for x in listOfDLLs]
    return " ".join(tempDllList)

def getImports(pe):
    listOfImports = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        listOfImports.append(entry.dll)
    return preprocessImports(listOfImports)

def getSectionNames(pe):
    listOfSectionNames = []
    for sec in pe.sections:
        normalized_name = sec.Name.decode().replace('\x00','').lower()
        listOfSectionNames.append(normalized_name)
    return " ".join(listOfSectionNames)

In [4]:
N=2
NgramCountsAll = collections.Counter([])
for sample in samples_train:
    NgramCountsAll += binaryFileToNgramCounts(sample, N)
K1 = 100
K1_most_frequent_Ngrams = NgramCountsAll.most_common(K1)
K1_most_frequent_Ngrams_list = [x[0] for x in K1_most_frequent_Ngrams]

In [5]:
importsCorpus_train = []
numSections_train = []
sectionNames_train = []
NgramFeaturesList_train = []
y_train = []
for i in range(len(samples_train)):
    sample = samples_train[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(sample, K1_most_frequent_Ngrams_list)
        pe = pefile.PE(sample)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_train.append(imports)
        numSections_train.append(nSections)
        sectionNames_train.append(secNames)
        NgramFeaturesList_train.append(NGramFeatures)
        y_train.append(labels_train[i])
    except Exception as e: 
        print(sample+":")
        print(e)

Benign PE Samples\lpr.exe:
'DOS Header magic not found.'
Malicious PE Samples\Build.exe:
'utf-8' codec can't decode byte 0xd2 in position 6: invalid continuation byte
Benign PE Samples\ADSchemaAnalyzer.exe:
'DOS Header magic not found.'
Benign PE Samples\evntwin.exe:
'DOS Header magic not found.'
Benign PE Samples\dplaysvr.exe:
'DOS Header magic not found.'
Benign PE Samples\cmak.exe:
'DOS Header magic not found.'
Benign PE Samples\BootExpCfg.exe:
'DOS Header magic not found.'
Benign PE Samples\dsdbutil.exe:
'DOS Header magic not found.'
Benign PE Samples\InetMgr.exe:
'DOS Header magic not found.'
Benign PE Samples\lpq.exe:
'DOS Header magic not found.'
Benign PE Samples\ldp.exe:
'DOS Header magic not found.'
Benign PE Samples\hvsimgr.exe:
'DOS Header magic not found.'
Malicious PE Samples\malware.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Benign PE Samples\eshell.exe:
'DOS Header magic not found.'
Benign PE Samples\InspectVhdDialog6.3.exe:
'DOS Header magic not found.'

In [7]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
imports_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
section_names_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
importsCorpus_train_transformed = imports_featurizer.fit_transform(importsCorpus_train)
sectionNames_train_transformed = section_names_featurizer.fit_transform(sectionNames_train)

In [8]:
from scipy.sparse import hstack, csr_matrix
X_train = hstack([NgramFeaturesList_train, importsCorpus_train_transformed,sectionNames_train_transformed, csr_matrix(numSections_train).transpose()])

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train,y_train)

In [10]:
clf.score(X_train, y_train)

1.0

In [11]:
importsCorpus_test = []
numSections_test = []
sectionNames_test = []
NgramFeaturesList_test = []
y_test = []
for i in range(len(samples_test)):
    sample = samples_test[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(sample, K1_most_frequent_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_test.append(imports)
        numSections_test.append(nSections)
        sectionNames_test.append(secNames)
        NgramFeaturesList_test.append(NGramFeatures)
        y_test.append(labels_test[i])
    except Exception as e: 
        print(sample+":")
        print(e)

Benign PE Samples\InspectVhdDialog.exe:
'DOS Header magic not found.'
Benign PE Samples\inetinfo.exe:
'DOS Header magic not found.'
Benign PE Samples\aspnetca.exe:
'DOS Header magic not found.'
Benign PE Samples\dsmgmt.exe:
'DOS Header magic not found.'
Benign PE Samples\dcdiag.exe:
'DOS Header magic not found.'
Benign PE Samples\iisrstas.exe:
'DOS Header magic not found.'
Benign PE Samples\bash.exe:
'DOS Header magic not found.'
Benign PE Samples\LxRun.exe:
'DOS Header magic not found.'
Benign PE Samples\iissetup.exe:
'DOS Header magic not found.'


In [12]:
importsCorpus_test_transformed = imports_featurizer.transform(importsCorpus_test)
sectionNames_test_transformed = section_names_featurizer.transform(sectionNames_test)
X_test = hstack([NgramFeaturesList_test, importsCorpus_test_transformed,sectionNames_test_transformed, csr_matrix(numSections_test).transpose()])

In [13]:
clf.score(X_test, y_test)

0.9236641221374046