In [28]:
import pickle
import pefile
import os
import pandas as pd
import joblib
import numpy
import sklearn.ensemble as ek
from sklearn.model_selection import train_test_split
from sklearn import tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression

In [29]:
def handleClassifier(data_path="data.csv", output_dir="classifier", random_state=None):
    """Train several classifiers on the PE-feature dataset and persist the best one.

    The dataset is a '|'-separated CSV containing the columns 'Name', 'md5',
    'Safe' (the label) and one column per numeric feature. An ExtraTrees model
    ranks the features, SelectFromModel keeps the important ones, and every
    candidate classifier is trained on that reduced matrix. The candidate with
    the highest accuracy on a 20% hold-out split is written to
    ``<output_dir>/classifier.pkl`` (joblib), and the names of the selected
    feature columns are written to ``<output_dir>/features.pkl`` (pickle).

    Parameters
    ----------
    data_path : str
        Path of the CSV file to train on.
    output_dir : str
        Directory receiving the two artifacts; created when missing.
    random_state : int or None
        Seed forwarded to the feature ranking, the train/test split and the
        tree-based candidates. None keeps the runs non-deterministic.

    Returns
    -------
    None
    """
    dataset = pd.read_csv(data_path, sep='|')

    feature_frame = dataset.drop(['Name', 'md5', 'Safe'], axis=1)
    X = feature_frame.values
    y = dataset['Safe'].values

    extratrees = ek.ExtraTreesClassifier(random_state=random_state).fit(X, y)
    selector = SelectFromModel(extratrees, prefit=True)
    new_X = selector.transform(X)

    # The saved names must describe exactly the columns of new_X, in the same
    # order, because prediction code rebuilds its input vector from this list.
    # get_support(indices=True) yields the kept column positions in that order.
    features = [feature_frame.columns[i] for i in selector.get_support(indices=True)]

    X_train, X_test, y_train, y_test = train_test_split(
        new_X, y, test_size=0.2, random_state=random_state
    )

    # Only classifiers take part: their .score() is accuracy, so the results
    # are comparable. (A regressor's .score() is R^2 and its predictions are
    # continuous, which is why LinearRegression was replaced by
    # LogisticRegression here.)
    model = {
        "DecisionTree": tree.DecisionTreeClassifier(max_depth=10, random_state=random_state),
        "RandomForest": ek.RandomForestClassifier(n_estimators=50, random_state=random_state),
        "Adaboost": ek.AdaBoostClassifier(n_estimators=50, random_state=random_state),
        "GradientBoosting": ek.GradientBoostingClassifier(n_estimators=50, random_state=random_state),
        "GNB": GaussianNB(),
        "LogisticRegression": linear_model.LogisticRegression(max_iter=1000),
    }

    results = {}
    for algo, clf in model.items():
        clf.fit(X_train, y_train)
        results[algo] = clf.score(X_test, y_test)

    winner = max(results, key=results.get)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(model[winner], os.path.join(output_dir, 'classifier.pkl'))
    # Context manager so the file is flushed and closed deterministically.
    with open(os.path.join(output_dir, 'features.pkl'), 'wb') as handle:
        pickle.dump(features, handle)
    
# Run the training step: this writes classifier/classifier.pkl and classifier/features.pkl.
handleClassifier()

                                                   Name  \
0     002ce0d28ec990aadbbc89df457189de37d8adaadc9c08...   
1     003851675800dc05cdac1baa84cab8f68534b244906d97...   
2     00eea85752664955047caad7d6280bc7bf1ab91c61eb9a...   
3     0111bddac92a792c7b2ee3ab77642c33df0e01afe737b0...   
4     016584e586de67b725ac1e3974fcca320bf81c8c489ebb...   
...                                                 ...   
1557        VirusShare_ff908265ebe205d880616e7d9bd408be   
1558        VirusShare_ffb456a28adf28a05af5746f996a96dc   
1559        VirusShare_ffbad1435c5f4b65f1470aeb2ae70d4e   
1560        VirusShare_ffe922f1f2235ab8239616489022ff32   
1561        VirusShare_fff3093fe6a763b588b51479f6f52a24   

                                   md5  Machine  SizeOfOptionalHeader  \
0     ebd0e35ecce5c00d31c641690fc77d10    34404                   240   
1     c997318e5def9e2386a95f1763bf4cba    34404                   240   
2     79cd4997890e99e4c6bb9d5c2d29dac6    34404                   240   

In [30]:
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse



def get_entropy(data):
    """Return the Shannon entropy, in bits per symbol, of `data`.

    `data` is a sized iterable whose items are either ints (e.g. a bytes
    object) or single characters; characters are mapped with ord(). Every
    symbol value must index into a 256-slot table. Empty input yields 0.0.
    """
    total = len(data)
    if not total:
        return 0.0

    # One counter per possible byte value.
    histogram = array.array('L', [0]) * 256
    for symbol in data:
        slot = symbol if isinstance(symbol, int) else ord(symbol)
        histogram[slot] += 1

    entropy = 0
    for count in histogram:
        if count == 0:
            continue
        probability = float(count) / total
        entropy -= probability * math.log(probability, 2)
    return entropy


def get_resources(pe):
    """List [entropy, size] pairs for the resource entries of a parsed PE.

    Walks the three levels below pe.DIRECTORY_ENTRY_RESOURCE (type, id,
    language) and, for each language-level entry, reads its bytes with
    pe.get_data() and records their entropy together with the declared size.
    The walk is best-effort: any exception stops it and whatever was gathered
    up to that point is returned. A PE without a resource directory gives [].
    """
    collected = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return collected

    try:
        for type_entry in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(type_entry, 'directory'):
                continue
            for id_entry in type_entry.directory.entries:
                if not hasattr(id_entry, 'directory'):
                    continue
                for lang_entry in id_entry.directory.entries:
                    blob = pe.get_data(
                        lang_entry.data.struct.OffsetToData,
                        lang_entry.data.struct.Size,
                    )
                    length = lang_entry.data.struct.Size
                    collected.append([get_entropy(blob), length])
    except Exception:
        # Malformed resource trees are expected; keep the partial result.
        return collected
    return collected

def get_version_info(pe):
    """Collect the version-resource fields of a parsed PE into one flat dict.

    The result merges every StringFileInfo string-table entry, the first
    entry of every VarFileInfo variable, and seven fixed fields ('flags',
    'os', 'type', 'file_version', 'product_version', 'signature',
    'struct_version') taken from VS_FIXEDFILEINFO when it is present.

    Both layouts of the version data are accepted: `pe.FileInfo` as a flat
    list of entries or as a list of lists, `Key` as str or bytes, and
    `pe.VS_FIXEDFILEINFO` as a single structure or a list of structures.

    Raises
    ------
    AttributeError
        When `pe` has no `FileInfo` attribute (no version information);
        callers rely on this to detect the "no version info" case.
    """
    res = {}
    # Intentionally unguarded attribute access -- see "Raises" above.
    for group in pe.FileInfo:
        # Normalise the flat layout to the nested one.
        if not isinstance(group, (list, tuple)):
            group = [group]
        for fileinfo in group:
            key = fileinfo.Key
            if isinstance(key, bytes):
                # On Python 3 a bytes key never equals a str literal.
                key = key.decode('utf-8', 'replace')
            if key == 'StringFileInfo':
                for st in getattr(fileinfo, 'StringTable', ()):
                    res.update(st.entries)
            elif key == 'VarFileInfo':
                for var in getattr(fileinfo, 'Var', ()):
                    # dict.items() is a view on Python 3 and cannot be indexed.
                    first = next(iter(var.entry.items()), None)
                    if first is not None:
                        res[first[0]] = first[1]

    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        fixed = pe.VS_FIXEDFILEINFO
        if isinstance(fixed, (list, tuple)):
            fixed = fixed[0] if fixed else None
        if fixed is not None:
            res['flags'] = fixed.FileFlags
            res['os'] = fixed.FileOS
            res['type'] = fixed.FileType
            res['file_version'] = fixed.FileVersionLS
            res['product_version'] = fixed.ProductVersionLS
            res['signature'] = fixed.Signature
            res['struct_version'] = fixed.StrucVersion
    return res

# Header fields copied verbatim into the feature dict, listed in output order.
_FILE_HEADER_FIELDS = (
    'Machine',
    'SizeOfOptionalHeader',
    'Characteristics',
)
_OPTIONAL_HEADER_FIELDS = (
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'SizeOfCode',
    'SizeOfInitializedData',
    'SizeOfUninitializedData',
    'AddressOfEntryPoint',
    'BaseOfCode',
    'BaseOfData',
    'ImageBase',
    'SectionAlignment',
    'FileAlignment',
    'MajorOperatingSystemVersion',
    'MinorOperatingSystemVersion',
    'MajorImageVersion',
    'MinorImageVersion',
    'MajorSubsystemVersion',
    'MinorSubsystemVersion',
    'SizeOfImage',
    'SizeOfHeaders',
    'CheckSum',
    'Subsystem',
    'DllCharacteristics',
    'SizeOfStackReserve',
    'SizeOfStackCommit',
    'SizeOfHeapReserve',
    'SizeOfHeapCommit',
    'LoaderFlags',
    'NumberOfRvaAndSizes',
)


def _mean_min_max(values):
    """Return (mean, min, max) of `values`, or (0, 0, 0) when there are none."""
    values = list(values)
    if not values:
        return 0, 0, 0
    return sum(values) / float(len(values)), min(values), max(values)


# Extract the info for a given file
def extract_infos(fpath):
    """Parse the PE file at `fpath` and return its features as a dict.

    The dict maps feature name to a number and covers: the FILE_HEADER and
    OPTIONAL_HEADER fields listed above, section statistics (count and
    mean/min/max of entropy, raw size, virtual size), import counts, export
    count, resource statistics (count and mean/min/max of entropy and size),
    the load-configuration size, and the number of version-information keys.
    Groups that are absent from the file are reported as 0.

    Raises whatever pefile.PE raises for a file it cannot parse. The parsed
    file is always closed before returning.
    """
    res = {}
    pe = pefile.PE(fpath)
    try:
        for field in _FILE_HEADER_FIELDS:
            res[field] = getattr(pe.FILE_HEADER, field)
        for field in _OPTIONAL_HEADER_FIELDS:
            if field == 'BaseOfData':
                # This field is not present on every optional header; use 0 then.
                res[field] = getattr(pe.OPTIONAL_HEADER, field, 0)
            else:
                res[field] = getattr(pe.OPTIONAL_HEADER, field)

        # Sections (a PE without sections yields zeros instead of crashing)
        sections = pe.sections
        res['SectionsNb'] = len(sections)
        (res['SectionsMeanEntropy'],
         res['SectionsMinEntropy'],
         res['SectionsMaxEntropy']) = _mean_min_max(s.get_entropy() for s in sections)
        (res['SectionsMeanRawsize'],
         res['SectionsMinRawsize'],
         res['SectionsMaxRawsize']) = _mean_min_max(s.SizeOfRawData for s in sections)
        # NOTE(review): the last key below is spelled 'SectionMax...' (no 's').
        # Consumers look features up by name, so it presumably mirrors the
        # dataset column -- verify before renaming.
        (res['SectionsMeanVirtualsize'],
         res['SectionsMinVirtualsize'],
         res['SectionMaxVirtualsize']) = _mean_min_max(s.Misc_VirtualSize for s in sections)

        # Imports
        try:
            import_entries = pe.DIRECTORY_ENTRY_IMPORT
            imported = [symbol for dll in import_entries for symbol in dll.imports]
            res['ImportsNbDLL'] = len(import_entries)
            res['ImportsNb'] = len(imported)
            res['ImportsNbOrdinal'] = sum(1 for symbol in imported if symbol.name is None)
        except AttributeError:
            res['ImportsNbDLL'] = 0
            res['ImportsNb'] = 0
            res['ImportsNbOrdinal'] = 0

        # Exports
        try:
            res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        except AttributeError:
            # No export
            res['ExportNb'] = 0

        # Resources: each item is an [entropy, size] pair
        resources = get_resources(pe)
        res['ResourcesNb'] = len(resources)
        (res['ResourcesMeanEntropy'],
         res['ResourcesMinEntropy'],
         res['ResourcesMaxEntropy']) = _mean_min_max(item[0] for item in resources)
        (res['ResourcesMeanSize'],
         res['ResourcesMinSize'],
         res['ResourcesMaxSize']) = _mean_min_max(item[1] for item in resources)

        # Load configuration size
        try:
            res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
        except AttributeError:
            res['LoadConfigurationSize'] = 0

        # Version configuration size
        try:
            version_infos = get_version_info(pe)
            res['VersionInformationSize'] = len(version_infos.keys())
        except AttributeError:
            res['VersionInformationSize'] = 0
    finally:
        # Release the file mapping held by pefile even when extraction fails.
        pe.close()
    return res

In [49]:
def ScanVirus(path, classifier_dir='classifier'):
    """Classify the PE file at `path` with the persisted model.

    Loads ``classifier.pkl`` (joblib) and ``features.pkl`` (pickle) from
    `classifier_dir`, extracts the file's features, orders them as listed in
    ``features.pkl`` and returns the model's prediction for that one sample.

    Any failure (missing artifacts, unparsable file, missing feature, ...)
    is printed and reported as 0, so a 0 result cannot be told apart from a
    genuine prediction of 0 without reading the printed message.

    NOTE(security): both artifacts are unpickled, which can execute arbitrary
    code; only load files produced by a trusted training run.
    """
    try:
        clf = joblib.load(os.path.join(classifier_dir, 'classifier.pkl'))
        # Context manager so the handle is closed even if unpickling fails.
        with open(os.path.join(classifier_dir, 'features.pkl'), 'rb') as handle:
            features = pickle.load(handle)
        data = extract_infos(path)
        pe_features = [data[name] for name in features]

        return clf.predict([pe_features])[0]
    except Exception as exc:
        print("Exception: ", exc)
        return 0

In [52]:
# Score one file from the E:\VIRUS folder and print the returned label.
# NOTE(review): absolute local path -- this cell only runs where that file exists.
print(ScanVirus("E:\\VIRUS\\VirusShare_0ca7768defed156842bc7d42030054ff"))

0


In [60]:
# Score one file from the SAFE2 folder and print the returned label.
print(ScanVirus('E:\\Malware_detective\\SAFE2\\1.exe'))

1


In [61]:
# Score another E:\VIRUS sample. ScanVirus prints any exception it catches and
# returns 0 in that case, so a printed 0 here may mean "error" rather than a prediction.
print(ScanVirus("E:\\VIRUS\\VirusShare_d21213c67dd84d3630df1fb58e1aa63f"))

Exception:  'DOS Header magic not found.'
0


In [69]:
# Score a sample from the VIRUS_EXTREME_DANGEROUS!!! folder and print the returned label.
print(ScanVirus("E:\\Malware_detective\\VIRUS_EXTREME_DANGEROUS!!!\\VirusShare_da66b3d1c3d02d8d4787dfc844f8e232"))

0


In [70]:
# Score an executable from the SAFE folder and print the returned label.
print(ScanVirus("E:\\Malware_detective\\SAFE\\1b399f61e57266427a5fceb4f3df6a337304832e24579c43385b59cfd660e068.exe"))

1
