In [None]:
# Download the EMBER dataset archive
!wget https://ember.elastic.co/ember_dataset_2017_2.tar.bz2 --no-check-certificate

In [None]:
# Decompressing a .bz2 file
!bzip2 -d ember_dataset_2017_2.tar.bz2

In [None]:
# Extracting from tar file
!tar -xvf ember_dataset_2017_2.tar

In [4]:
pip install lief

Collecting lief
  Downloading lief-0.11.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[K     |████████████████████████████████| 3.9 MB 5.3 MB/s 
[?25hInstalling collected packages: lief
Successfully installed lief-0.11.5


In [5]:

import re
import lief
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher

# Only the major and minor components of the LIEF version are needed; slicing tolerates version
# strings with any number of further dot-separated parts.
LIEF_MAJOR, LIEF_MINOR = lief.__version__.split('.')[:2]
# LIEF >= 0.10: exported functions are objects with a .name attribute rather than plain strings
LIEF_EXPORT_OBJECT = (int(LIEF_MAJOR), int(LIEF_MINOR)) >= (0, 10)
# LIEF >= 0.11: the binary exposes has_signatures instead of has_signature
LIEF_HAS_SIGNATURE = (int(LIEF_MAJOR), int(LIEF_MINOR)) >= (0, 11)


class FeatureType(object):
    ''' Abstract interface implemented by every feature group.

    A concrete feature group sets ``name`` and ``dim`` and overrides both
    ``raw_features`` and ``process_raw_features``.
    '''

    name = ''
    dim = 0

    def __repr__(self):
        return f'{self.name}({self.dim})'

    def raw_features(self, bytez, lief_binary):
        ''' Build a JSON-serializable description of the sample (must be overridden) '''
        raise NotImplementedError

    def process_raw_features(self, raw_obj):
        ''' Convert the output of raw_features into a feature vector (must be overridden) '''
        raise NotImplementedError

    def feature_vector(self, bytez, lief_binary):
        ''' Compute the feature vector straight from the sample by running raw_features and then
        process_raw_features. Override only when fusing the two steps is significantly faster. '''
        raw_obj = self.raw_features(bytez, lief_binary)
        return self.process_raw_features(raw_obj)


class ByteHistogram(FeatureType):
    ''' Byte histogram over the entire binary file.

    The raw feature is the list of 256 per-byte-value counts; the processed feature is the same
    histogram normalized so that it sums to 1.
    '''

    name = 'histogram'
    dim = 256

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Count how often each byte value 0..255 occurs in bytez (lief_binary is not used) '''
        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return counts.tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the raw counts into a float32 vector that sums to 1 '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return the all-zero histogram instead of 0/0 -> NaN
            return counts
        return counts / total


class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
    This roughly approximates the joint probability of byte value and local entropy.
    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.

    The raw feature is a flattened 16x16 table of counts (row: entropy bin, column: high nibble of
    the byte value); the processed feature is that table normalized so that it sums to 1.
    '''

    name = 'byteentropy'
    dim = 256

    def __init__(self, step=1024, window=2048):
        ''' window: number of bytes per block; step: distance in bytes between consecutive block starts '''
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()
        self.window = window
        self.step = step

    def _entropy_bin_counts(self, block):
        ''' Return (entropy bin index, 16-bin histogram of high nibbles) for one block of bytes '''
        # coarse histogram, 16 byte values per bin
        c = np.bincount(block >> 4, minlength=16)
        # NOTE: always divides by the configured window, even when the block is shorter than it
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]  # only non-empty bins, so log2(0) is never evaluated
        # x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)
        H = np.sum(-p[wh] * np.log2(p[wh])) * 2

        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
        if Hbin == 16:  # handle entropy = 8.0 bits
            Hbin = 15

        return Hbin, c

    def raw_features(self, bytez, lief_binary):
        ''' Accumulate the per-block nibble histograms into the row of their entropy bin
        (lief_binary is not used). Returns the 16x16 table flattened to a list of 256 ints. '''
        # np.int64 instead of the np.int alias, which recent NumPy releases no longer provide
        output = np.zeros((16, 16), dtype=np.int64)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            # input shorter than one window: treat the whole input as a single block
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the raw counts into a float32 vector that sums to 1 '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return the all-zero table instead of 0/0 -> NaN
            return counts
        return counts / total


class SectionInfo(FeatureType):
    ''' Information about section names, sizes and entropy.  Uses hashing trick
    to summarize all this section info into a feature vector.

    The vector holds 5 general counts followed by five hashed groups of 50 values each:
    section sizes, section entropies, section virtual sizes, entry-section name and
    entry-section characteristics.
    '''

    name = 'section'
    dim = 5 + 50 + 50 + 50 + 50 + 50

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    @staticmethod
    def _properties(s):
        ''' Names of the section's characteristics with the enum class prefix stripped '''
        return [str(c).split('.')[-1] for c in s.characteristics_lists]

    def raw_features(self, bytez, lief_binary):
        ''' Describe the entry section and every section of the parsed binary (bytez is not used).
        Returns {"entry": "", "sections": []} when the binary could not be parsed. '''
        if lief_binary is None:
            return {"entry": "", "sections": []}

        # properties of entry point, or if invalid, the first executable section
        try:
            entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry_section = ""
            for s in lief_binary.sections:
                if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                    entry_section = s.name
                    break

        raw_obj = {"entry": entry_section}
        raw_obj["sections"] = [{
            'name': s.name,
            'size': s.size,
            'entropy': s.entropy,
            'vsize': s.virtual_size,
            'props': self._properties(s)
        } for s in lief_binary.sections]
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Turn the raw section description into a float32 vector of length ``dim`` '''
        sections = raw_obj['sections']
        general = [
            len(sections),  # total number of sections
            # number of sections with zero size
            sum(1 for s in sections if s['size'] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s['name'] == ""),
            # number of RX
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of W
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
        ]
        # gross characteristics of each section
        section_sizes = [(s['name'], s['size']) for s in sections]
        section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
        section_entropy = [(s['name'], s['entropy']) for s in sections]
        section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
        section_vsize = [(s['name'], s['vsize']) for s in sections]
        section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
        # The entry-section name has always been hashed one character at a time, because the string
        # itself was passed as the sample.  list(...) yields the same tokens without handing
        # FeatureHasher a bare string as a sample.
        # NOTE(review): newer scikit-learn releases appear to reject bare-string samples -- confirm.
        entry_tokens = list(raw_obj['entry'])
        entry_name_hashed = FeatureHasher(50, input_type="string").transform([entry_tokens]).toarray()[0]
        # characteristics of every section whose name matches the entry section
        characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
        characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]

        return np.hstack([
            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
            characteristics_hashed
        ]).astype(np.float32)


class ImportsInfo(FeatureType):
    ''' Information about imported libraries and functions from the
    import address table.  Note that the total number of imported
    functions is contained in GeneralFileInfo.

    The vector is 256 hashed library names followed by 1024 hashed "library:function" names.
    '''

    name = 'imports'
    dim = 1280

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Map each imported library name to the list of functions imported from it (bytez is
        not used).  Returns an empty dict when the binary could not be parsed. '''
        imports = {}
        if lief_binary is None:
            return imports

        for lib in lief_binary.imports:
            # libraries can be duplicated in listing, extend instead of overwrite
            functions = imports.setdefault(lib.name, [])

            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
            #  beyond the first 10000 characters, and this will help limit the dataset size
            for entry in lib.entries:
                if entry.is_ordinal:
                    functions.append("ordinal" + str(entry.ordinal))
                else:
                    functions.append(entry.name[:10000])

        return imports

    def process_raw_features(self, raw_obj):
        ''' Hash the library names and the qualified function names into a float32 vector '''
        # unique libraries (lower-cased)
        libraries = list(set(lib.lower() for lib in raw_obj.keys()))
        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]

        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]

        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)


class ExportsInfo(FeatureType):
    ''' Information about exported functions. Note that the total number of exported
    functions is contained in GeneralFileInfo.
    '''

    name = 'exports'
    dim = 128

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' List the exported function names, each clipped to 10000 characters (bytez is not
        used).  Returns an empty list when the binary could not be parsed. '''
        if lief_binary is None:
            return []

        # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
        #  the first 10000 characters, and this will help limit the dataset size
        if LIEF_EXPORT_OBJECT:
            # export is an object with .name attribute (0.10.0 and later)
            return [export.name[:10000] for export in lief_binary.exported_functions]

        # export is a string (LIEF 0.9.0 and earlier)
        return [export[:10000] for export in lief_binary.exported_functions]

    def process_raw_features(self, raw_obj):
        ''' Hash the exported names into a float32 vector of length 128 '''
        exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
        return exports_hashed.astype(np.float32)


class GeneralFileInfo(FeatureType):
    ''' General information about the file '''

    name = 'general'
    dim = 10

    # Raw-feature keys, in the order they are written to the feature vector.
    _FIELDS = (
        'size', 'vsize', 'has_debug', 'exports', 'imports', 'has_relocations', 'has_resources',
        'has_signature', 'has_tls', 'symbols'
    )

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Collect file-level sizes, counts and 0/1 flags.  When the binary could not be parsed
        only 'size' is filled in; every other field is 0. '''
        if lief_binary is None:
            raw_obj = dict.fromkeys(self._FIELDS, 0)
            raw_obj['size'] = len(bytez)
            return raw_obj

        # the attribute carrying the signature flag differs between LIEF versions
        if LIEF_HAS_SIGNATURE:
            has_signature = int(lief_binary.has_signatures)
        else:
            has_signature = int(lief_binary.has_signature)

        return {
            'size': len(bytez),
            'vsize': lief_binary.virtual_size,
            'has_debug': int(lief_binary.has_debug),
            'exports': len(lief_binary.exported_functions),
            'imports': len(lief_binary.imported_functions),
            'has_relocations': int(lief_binary.has_relocations),
            'has_resources': int(lief_binary.has_resources),
            'has_signature': has_signature,
            'has_tls': int(lief_binary.has_tls),
            'symbols': len(lief_binary.symbols),
        }

    def process_raw_features(self, raw_obj):
        ''' Lay the raw fields out as a float32 vector in _FIELDS order '''
        return np.asarray([raw_obj[field] for field in self._FIELDS], dtype=np.float32)


class HeaderFileInfo(FeatureType):
    ''' Machine, architecture, OS, linker and other information extracted from header.

    The vector is the COFF timestamp, five hashed groups of 10 values (machine, COFF
    characteristics, subsystem, DLL characteristics, magic) and 11 numeric optional-header fields.
    '''

    name = 'header'
    dim = 62

    # Numeric optional-header fields, in the order they are written to the feature vector.
    _OPTIONAL_NUMERIC_FIELDS = (
        'major_image_version', 'minor_image_version', 'major_linker_version', 'minor_linker_version',
        'major_operating_system_version', 'minor_operating_system_version', 'major_subsystem_version',
        'minor_subsystem_version', 'sizeof_code', 'sizeof_headers', 'sizeof_heap_commit'
    )

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    @staticmethod
    def _enum_name(value):
        ''' Text after the last '.' of str(value), i.e. the enum member name without its prefix '''
        return str(value).split('.')[-1]

    def raw_features(self, bytez, lief_binary):
        ''' Collect COFF and optional-header fields (bytez is not used).  When the binary could
        not be parsed, every field keeps its empty default ("" / [] / 0). '''
        optional = {'subsystem': "", 'dll_characteristics': [], 'magic': ""}
        optional.update((field, 0) for field in self._OPTIONAL_NUMERIC_FIELDS)
        coff = {'timestamp': 0, 'machine': "", 'characteristics': []}
        raw_obj = {'coff': coff, 'optional': optional}
        if lief_binary is None:
            return raw_obj

        header = lief_binary.header
        coff['timestamp'] = header.time_date_stamps
        coff['machine'] = self._enum_name(header.machine)
        coff['characteristics'] = [self._enum_name(c) for c in header.characteristics_list]

        optional_header = lief_binary.optional_header
        optional['subsystem'] = self._enum_name(optional_header.subsystem)
        optional['dll_characteristics'] = [
            self._enum_name(c) for c in optional_header.dll_characteristics_lists
        ]
        optional['magic'] = self._enum_name(optional_header.magic)
        for field in self._OPTIONAL_NUMERIC_FIELDS:
            optional[field] = getattr(optional_header, field)
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Turn the raw header description into a float32 vector of length ``dim`` '''
        coff = raw_obj['coff']
        optional = raw_obj['optional']

        def hashed(tokens):
            # hash one sample (a list of strings) into 10 buckets
            return FeatureHasher(10, input_type="string").transform([tokens]).toarray()[0]

        return np.hstack(
            [
                coff['timestamp'],
                hashed([coff['machine']]),
                hashed(coff['characteristics']),
                hashed([optional['subsystem']]),
                hashed(optional['dll_characteristics']),
                hashed([optional['magic']]),
            ] + [optional[field] for field in self._OPTIONAL_NUMERIC_FIELDS]
        ).astype(np.float32)


class StringExtractor(FeatureType):
    ''' Extracts strings from raw byte stream.

    The vector is: number of strings, average string length, number of printable characters,
    the 96-bin normalized histogram of printable characters, its entropy, and the counts of
    path, URL, registry and MZ matches.
    '''

    name = 'strings'
    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()
        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # occurrences of the string 'C:\'.  Not actually extracting the path
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # occurrences of http:// or https://.  Not actually extracting the URLs
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # occurrences of the string prefix HKEY_.  Not actually extracting registry names
        self._registry = re.compile(b'HKEY_')
        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
        self._mz = re.compile(b'MZ')

    def raw_features(self, bytez, lief_binary):
        ''' Compute string statistics and pattern counts over the raw bytes (lief_binary is not used) '''
        allstrings = self._allstrings.findall(bytez)
        if allstrings:
            # statistics about strings:
            string_lengths = [len(s) for s in allstrings]
            avlength = sum(string_lengths) / len(string_lengths)
            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
            # (vectorized; every byte is >= 0x20 because of the regex, so the subtraction cannot wrap)
            as_shifted_string = np.frombuffer(b''.join(allstrings), dtype=np.uint8) - ord(b'\x20')
            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
            # distribution of characters in printable strings
            csum = c.sum()
            p = c.astype(np.float32) / csum
            wh = np.where(c)[0]  # only non-empty bins, so log2(0) is never evaluated
            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
        else:
            avlength = 0
            c = np.zeros((96,), dtype=np.float32)
            H = 0
            csum = 0

        return {
            'numstrings': len(allstrings),
            'avlength': avlength,
            'printabledist': c.tolist(),  # store non-normalized histogram
            'printables': int(csum),
            'entropy': float(H),
            'paths': len(self._paths.findall(bytez)),
            'urls': len(self._urls.findall(bytez)),
            'registry': len(self._registry.findall(bytez)),
            'MZ': len(self._mz.findall(bytez))
        }

    def process_raw_features(self, raw_obj):
        ''' Turn the raw string statistics into a float32 vector of length ``dim`` '''
        # divide by 1 when there are no printable characters so the histogram stays all-zero
        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
        return np.hstack([
            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
            raw_obj['registry'], raw_obj['MZ']
        ]).astype(np.float32)


class DataDirectories(FeatureType):
    ''' Extracts size and virtual address of the first 15 data directories '''

    name = 'datadirectories'
    dim = 15 * 2

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()
        # Only the length of this list is used below: it fixes how many directories are encoded.
        self._name_order = [
            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
        ]

    def raw_features(self, bytez, lief_binary):
        ''' List name, size and RVA of every data directory, in the order LIEF reports them
        (bytez is not used).  Returns an empty list when the binary could not be parsed. '''
        if lief_binary is None:
            return []

        return [{
            "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
            "size": data_directory.size,
            "virtual_address": data_directory.rva
        } for data_directory in lief_binary.data_directories]

    def process_raw_features(self, raw_obj):
        ''' Write (size, virtual_address) of the i-th raw entry to slots 2i and 2i+1.
        Entries are taken by position, not matched by name; missing entries stay 0. '''
        slot_count = len(self._name_order)
        features = np.zeros(2 * slot_count, dtype=np.float32)
        for i, entry in enumerate(raw_obj[:slot_count]):
            features[2 * i] = entry["size"]
            features[2 * i + 1] = entry["virtual_address"]
        return features


class PEFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector of fixed size.

    feature_version 1 uses the eight base feature groups; feature_version 2 additionally appends
    DataDirectories.  ``dim`` is the sum of the ``dim`` of every selected feature group.
    '''

    # Names of the LIEF exception types that mean "this file could not be parsed".
    _LIEF_ERROR_NAMES = ('bad_format', 'bad_file', 'pe_error', 'parser_error', 'read_out_of_bound')

    def __init__(self, feature_version=2, print_feature_warning=True):
        ''' feature_version: 1 or 2 (anything else raises ValueError, a subclass of the Exception
        raised previously).  print_feature_warning: print a notice when the installed LIEF version
        differs from the one the chosen feature version was computed with. '''
        self.features = [
            ByteHistogram(),
            ByteEntropyHistogram(),
            StringExtractor(),
            GeneralFileInfo(),
            HeaderFileInfo(),
            SectionInfo(),
            ImportsInfo(),
            ExportsInfo()
        ]
        if feature_version == 1:
            expected_prefix, computed_with = "0.8.3", "0.8.3-18d5b75"
        elif feature_version == 2:
            self.features.append(DataDirectories())
            expected_prefix, computed_with = "0.9.0", "0.9.0-"
        else:
            raise ValueError(f"EMBER feature version must be 1 or 2. Not {feature_version}")

        if print_feature_warning and not lief.__version__.startswith(expected_prefix):
            print(f"WARNING: EMBER feature version {feature_version} were computed using lief version {computed_with}")
            print(f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
            print("WARNING:   in the feature calculations.")
        self.dim = sum([fe.dim for fe in self.features])

    def raw_features(self, bytez):
        ''' Parse bytez with LIEF and collect every feature group's raw features, keyed by group
        name, plus the sample's "sha256".  A file LIEF cannot parse is reported on stdout and
        processed with lief_binary=None; any other exception propagates to the caller. '''
        # Look the exception types up by name so that a LIEF build lacking some of them does not
        # fail here with AttributeError.
        # NOTE(review): which LIEF releases define which of these types should be confirmed.
        lief_errors = tuple(
            getattr(lief, error_name) for error_name in self._LIEF_ERROR_NAMES if hasattr(lief, error_name)
        ) + (RuntimeError,)
        try:
            lief_binary = lief.PE.parse(list(bytez))
        except lief_errors as e:
            print("lief error: ", str(e))
            lief_binary = None

        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
        return features

    def process_raw_features(self, raw_obj):
        ''' Concatenate every feature group's vector into one float32 vector '''
        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
        return np.hstack(feature_vectors).astype(np.float32)

    def feature_vector(self, bytez):
        ''' Compute the final feature vector directly from the raw bytes of a sample '''
        return self.process_raw_features(self.raw_features(bytez))

In [None]:
import re
import lief
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher

# Only the major and minor components of the LIEF version are needed; slicing tolerates version
# strings with any number of further dot-separated parts.
LIEF_MAJOR, LIEF_MINOR = lief.__version__.split('.')[:2]
# True when the installed LIEF is 0.10 or newer
LIEF_EXPORT_OBJECT = (int(LIEF_MAJOR), int(LIEF_MINOR)) >= (0, 10)
# True when the installed LIEF is 0.11 or newer
LIEF_HAS_SIGNATURE = (int(LIEF_MAJOR), int(LIEF_MINOR)) >= (0, 11)


class FeatureType(object):
    ''' Abstract interface implemented by every feature group.

    A concrete feature group sets ``name`` and ``dim`` and overrides both
    ``raw_features`` and ``process_raw_features``.
    '''

    name = ''
    dim = 0

    def __repr__(self):
        return f'{self.name}({self.dim})'

    def raw_features(self, bytez, lief_binary):
        ''' Build a JSON-serializable description of the sample (must be overridden) '''
        raise NotImplementedError

    def process_raw_features(self, raw_obj):
        ''' Convert the output of raw_features into a feature vector (must be overridden) '''
        raise NotImplementedError

    def feature_vector(self, bytez, lief_binary):
        ''' Compute the feature vector straight from the sample by running raw_features and then
        process_raw_features. Override only when fusing the two steps is significantly faster. '''
        raw_obj = self.raw_features(bytez, lief_binary)
        return self.process_raw_features(raw_obj)

class ByteHistogram(FeatureType):
    ''' Byte histogram over the entire binary file.

    The raw feature is the list of 256 per-byte-value counts; the processed feature is the same
    histogram normalized so that it sums to 1.
    '''

    name = 'histogram'
    dim = 256

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Count how often each byte value 0..255 occurs in bytez (lief_binary is not used) '''
        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return counts.tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the raw counts into a float32 vector that sums to 1 '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return the all-zero histogram instead of 0/0 -> NaN
            return counts
        return counts / total

class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
    This roughly approximates the joint probability of byte value and local entropy.
    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.

    The raw feature is a flattened 16x16 table of counts (row: entropy bin, column: high nibble of
    the byte value); the processed feature is that table normalized so that it sums to 1.
    '''

    name = 'byteentropy'
    dim = 256

    def __init__(self, step=1024, window=2048):
        ''' window: number of bytes per block; step: distance in bytes between consecutive block starts '''
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()
        self.window = window
        self.step = step

    def _entropy_bin_counts(self, block):
        ''' Return (entropy bin index, 16-bin histogram of high nibbles) for one block of bytes '''
        # coarse histogram, 16 byte values per bin
        c = np.bincount(block >> 4, minlength=16)
        # NOTE: always divides by the configured window, even when the block is shorter than it
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]  # only non-empty bins, so log2(0) is never evaluated
        # x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)
        H = np.sum(-p[wh] * np.log2(p[wh])) * 2

        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
        if Hbin == 16:  # handle entropy = 8.0 bits
            Hbin = 15

        return Hbin, c

    def raw_features(self, bytez, lief_binary):
        ''' Accumulate the per-block nibble histograms into the row of their entropy bin
        (lief_binary is not used). Returns the 16x16 table flattened to a list of 256 ints. '''
        # np.int64 instead of the np.int alias, which recent NumPy releases no longer provide
        output = np.zeros((16, 16), dtype=np.int64)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            # input shorter than one window: treat the whole input as a single block
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the raw counts into a float32 vector that sums to 1 '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return the all-zero table instead of 0/0 -> NaN
            return counts
        return counts / total

class SectionInfo(FeatureType):
    ''' Information about section names, sizes and entropy.  Uses hashing trick
    to summarize all this section info into a feature vector.

    The vector holds 5 general counts followed by five hashed groups of 50 values each:
    section sizes, section entropies, section virtual sizes, entry-section name and
    entry-section characteristics.
    '''

    name = 'section'
    dim = 5 + 50 + 50 + 50 + 50 + 50

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()

    @staticmethod
    def _properties(s):
        ''' Names of the section's characteristics with the enum class prefix stripped '''
        return [str(c).split('.')[-1] for c in s.characteristics_lists]

    def raw_features(self, bytez, lief_binary):
        ''' Describe the entry section and every section of the parsed binary (bytez is not used).
        Returns {"entry": "", "sections": []} when the binary could not be parsed. '''
        if lief_binary is None:
            return {"entry": "", "sections": []}

        # properties of entry point, or if invalid, the first executable section
        try:
            entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry_section = ""
            for s in lief_binary.sections:
                if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                    entry_section = s.name
                    break

        raw_obj = {"entry": entry_section}
        raw_obj["sections"] = [{
            'name': s.name,
            'size': s.size,
            'entropy': s.entropy,
            'vsize': s.virtual_size,
            'props': self._properties(s)
        } for s in lief_binary.sections]
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Turn the raw section description into a float32 vector of length ``dim`` '''
        sections = raw_obj['sections']
        general = [
            len(sections),  # total number of sections
            # number of sections with zero size
            sum(1 for s in sections if s['size'] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s['name'] == ""),
            # number of RX
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of W
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
        ]
        # gross characteristics of each section
        section_sizes = [(s['name'], s['size']) for s in sections]
        section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
        section_entropy = [(s['name'], s['entropy']) for s in sections]
        section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
        section_vsize = [(s['name'], s['vsize']) for s in sections]
        section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
        # The entry-section name has always been hashed one character at a time, because the string
        # itself was passed as the sample.  list(...) yields the same tokens without handing
        # FeatureHasher a bare string as a sample.
        # NOTE(review): newer scikit-learn releases appear to reject bare-string samples -- confirm.
        entry_tokens = list(raw_obj['entry'])
        entry_name_hashed = FeatureHasher(50, input_type="string").transform([entry_tokens]).toarray()[0]
        # characteristics of every section whose name matches the entry section
        characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
        characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]

        return np.hstack([
            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
            characteristics_hashed
        ]).astype(np.float32)

class DataDirectories(FeatureType):
    ''' Extracts size and virtual address of the first 15 data directories '''

    name = 'datadirectories'
    dim = 15 * 2

    def __init__(self):
        # zero-argument super() starts the lookup at this class; super(FeatureType, self) named the wrong class
        super().__init__()
        # Only the length of this list is used below: it fixes how many directories are encoded.
        self._name_order = [
            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
        ]

    def raw_features(self, bytez, lief_binary):
        ''' List name, size and RVA of every data directory, in the order LIEF reports them
        (bytez is not used).  Returns an empty list when the binary could not be parsed. '''
        if lief_binary is None:
            return []

        return [{
            "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
            "size": data_directory.size,
            "virtual_address": data_directory.rva
        } for data_directory in lief_binary.data_directories]

    def process_raw_features(self, raw_obj):
        ''' Write (size, virtual_address) of the i-th raw entry to slots 2i and 2i+1.
        Entries are taken by position, not matched by name; missing entries stay 0. '''
        slot_count = len(self._name_order)
        features = np.zeros(2 * slot_count, dtype=np.float32)
        for i, entry in enumerate(raw_obj[:slot_count]):
            features[2 * i] = entry["size"]
            features[2 * i + 1] = entry["virtual_address"]
        return features

class PEFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector of fixed size.

    The extractor owns a list of FeatureType objects (``self.features``); ``self.dim`` is the sum
    of their dimensions.
    '''

    # Names of the lief exception classes that are treated as "this file could not be parsed".
    _LIEF_ERROR_NAMES = ("bad_format", "bad_file", "pe_error", "parser_error", "read_out_of_bound")

    def __init__(self, feature_version=2, print_feature_warning=True):
        ''' Build the feature list for the requested EMBER feature version.

        feature_version: 1 or 2. Version 2 additionally includes DataDirectories.
        print_feature_warning: when True, print a warning if the installed lief version differs
            from the one named in the warning text for that feature version.
        Raises ValueError (a subclass of Exception) for any other feature version.
        '''
        self.features = [
            ByteHistogram(),
            ByteEntropyHistogram(),
            SectionInfo()
        ]
        if feature_version == 1:
            self._warn_on_lief_mismatch(
                "0.8.3",
                "WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75",
                print_feature_warning)
        elif feature_version == 2:
            self.features.append(DataDirectories())
            self._warn_on_lief_mismatch(
                "0.9.0",
                "WARNING: EMBER feature version 2 were computed using lief version 0.9.0-",
                print_feature_warning)
        else:
            raise ValueError(f"EMBER feature version must be 1 or 2. Not {feature_version}")
        self.dim = sum(fe.dim for fe in self.features)

    @staticmethod
    def _warn_on_lief_mismatch(expected_prefix, headline, enabled):
        ''' Print the three-line lief version warning when enabled and the versions differ. '''
        if enabled and not lief.__version__.startswith(expected_prefix):
            print(headline)
            print(f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
            print("WARNING:   in the feature calculations.")

    @classmethod
    def _lief_parse_errors(cls):
        ''' Return the tuple of exception classes that mean "lief could not parse the file".

        The lief classes are looked up defensively: if the installed lief module does not expose
        one of them, it is skipped instead of raising AttributeError. RuntimeError is always included.
        '''
        candidates = (getattr(lief, error_name, None) for error_name in cls._LIEF_ERROR_NAMES)
        available = tuple(
            candidate for candidate in candidates
            if isinstance(candidate, type) and issubclass(candidate, BaseException)
        )
        return available + (RuntimeError,)

    def raw_features(self, bytez):
        ''' Return a JSON-able dict of raw features for the file contents ``bytez``.

        The dict holds the sha256 of ``bytez`` plus one entry per feature type, keyed by its name.
        If lief fails with one of the known parse errors, the error is printed and the feature
        types receive ``lief_binary=None``; any other exception propagates to the caller.
        '''
        lief_errors = self._lief_parse_errors()
        try:
            lief_binary = lief.PE.parse(list(bytez))
        except lief_errors as e:
            print("lief error: ", str(e))
            lief_binary = None

        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
        return features

    def process_raw_features(self, raw_obj):
        ''' Concatenate the per-feature vectors built from ``raw_obj`` into one float32 vector.

        ``raw_obj`` must contain one entry per feature type, keyed by that feature type's name.
        '''
        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
        return np.hstack(feature_vectors).astype(np.float32)

    def feature_vector(self, bytez):
        ''' Compute the feature vector directly from the file contents ``bytez``. '''
        return self.process_raw_features(self.raw_features(bytez))


In [6]:
import os
import json
import tqdm
import numpy as np
import pandas as pd
import multiprocessing

def raw_feature_iterator(file_paths):
    """
    Yield raw feature strings from the given file paths.

    The files are read in the order given, one line at a time; each yielded string is one
    line of a file, including its trailing newline if present.
    """
    for feature_path in file_paths:
        with open(feature_path, "r") as handle:
            yield from handle

def vectorize_unpack(args):
    """
    Adapter for pool APIs that pass a single argument: expands the ``args`` tuple into
    the positional arguments of ``vectorize`` and returns its result.
    """
    outcome = vectorize(*args)
    return outcome


def vectorize(irow, raw_features_string, X_path, y_path, extractor, nrows):
    """
    Vectorize one JSON-encoded sample and store it as row ``irow`` of the on-disk arrays.

    The sample's "label" goes into the float32 array at ``y_path`` (shape ``nrows``) and the
    vector from ``extractor.process_raw_features`` into the float32 array at ``X_path``
    (shape ``(nrows, extractor.dim)``). Both files must already exist with those sizes.
    """
    sample = json.loads(raw_features_string)
    row_vector = extractor.process_raw_features(sample)

    # (file, array shape, value for this row) -- labels first, then features
    destinations = (
        (y_path, nrows, sample["label"]),
        (X_path, (nrows, extractor.dim), row_vector),
    )
    for destination_path, array_shape, row_value in destinations:
        store = np.memmap(destination_path, dtype=np.float32, mode="r+", shape=array_shape)
        store[irow] = row_value


def vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows):
    """
    Vectorize a subset of data and write it to disk

    X_path / y_path: files that are (re)created as float32 arrays of shape
        ``(nrows, extractor.dim)`` and ``nrows``.
    raw_feature_paths: files whose lines are the JSON-encoded raw samples; line number ``i``
        (counted across all files, in order) is written to row ``i``.
    extractor: object passed through to ``vectorize`` for every sample.
    nrows: total number of samples, also used as the progress-bar total.
    """
    # Create space on disk to write features to. The arrays are only needed to allocate the
    # files, so the references are dropped right away; workers reopen the files themselves.
    X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(nrows, extractor.dim))
    y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=nrows)
    del X, y

    argument_iterator = ((irow, raw_features_string, X_path, y_path, extractor, nrows)
                         for irow, raw_features_string in enumerate(raw_feature_iterator(raw_feature_paths)))

    # Distribute the vectorization work. The pool is a context manager so its worker processes
    # are always released, even when a worker raises; previously the pool was never closed.
    with multiprocessing.Pool() as pool:
        for _ in tqdm.tqdm(pool.imap_unordered(vectorize_unpack, argument_iterator), total=nrows):
            pass
        # On success, let the workers finish and exit cleanly before the pool is torn down
        pool.close()
        pool.join()


def _count_lines(paths):
    """Return the total number of lines across ``paths``, closing each file once it is counted."""
    total = 0
    for path in paths:
        with open(path) as fin:
            total += sum(1 for _ in fin)
    return total


def create_vectorized_features(data_dir, feature_version=2):
    """
    Create feature vectors from raw features and write them to disk

    Reads ``train_features_0.jsonl`` .. ``train_features_5.jsonl`` and ``test_features.jsonl``
    from ``data_dir`` and writes ``X_train.dat`` / ``y_train.dat`` and ``X_test.dat`` /
    ``y_test.dat`` into the same directory.
    """
    extractor = PEFeatureExtractor(feature_version)

    # (progress message, suffix of the output files, raw feature file names)
    subsets = (
        ("Vectorizing training set", "train", ["train_features_{}.jsonl".format(i) for i in range(6)]),
        ("Vectorizing test set", "test", ["test_features.jsonl"]),
    )
    for message, suffix, file_names in subsets:
        print(message)
        X_path = os.path.join(data_dir, "X_{}.dat".format(suffix))
        y_path = os.path.join(data_dir, "y_{}.dat".format(suffix))
        raw_feature_paths = [os.path.join(data_dir, file_name) for file_name in file_names]
        # One line per sample, so the line count fixes the number of rows to allocate
        nrows = _count_lines(raw_feature_paths)
        vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows)


def read_vectorized_features(data_dir, subset=None, feature_version=2):
    """
    Read vectorized features into memory mapped numpy arrays

    subset="train" or subset="test" returns ``(X, y)`` for that subset; subset=None returns
    ``(X_train, y_train, X_test, y_test)``. Any other value returns None.
    The number of rows is taken from the length of the label file, the number of columns
    from ``PEFeatureExtractor(feature_version).dim``.
    """
    if subset not in (None, "train", "test"):
        return None

    ndim = PEFeatureExtractor(feature_version).dim
    file_names = {
        "train": ("X_train.dat", "y_train.dat"),
        "test": ("X_test.dat", "y_test.dat"),
    }

    def load_split(split):
        # Labels are opened first: their length gives the row count of the feature matrix
        features_name, labels_name = file_names[split]
        labels = np.memmap(os.path.join(data_dir, labels_name), dtype=np.float32, mode="r")
        row_count = labels.shape[0]
        features = np.memmap(os.path.join(data_dir, features_name), dtype=np.float32, mode="r",
                             shape=(row_count, ndim))
        return features, labels

    if subset is not None:
        return load_split(subset)

    X_train, y_train = load_split("train")
    X_test, y_test = load_split("test")
    return X_train, y_train, X_test, y_test
             
def read_metadata_record(raw_features_string):
    """
    Decode a raw features string and return the metadata fields

    Only the keys listed in ``wanted_keys`` that are actually present in the decoded
    record are returned.
    """
    # Wider alternative: {"sha256", "label", "histogram", "byteentropy", "section"}
    wanted_keys = {"sha256"}
    record = json.loads(raw_features_string)
    present_keys = record.keys() & wanted_keys
    return {key: record[key] for key in present_keys}


def create_metadata(data_dir):
    """
    Write metadata to a csv file and return its dataframe

    Reads the train and test ``.jsonl`` feature files in ``data_dir`` and writes
    ``train_metadata.csv``, ``test_metadata.csv`` and the combined ``metadata.csv`` (which
    carries an extra "subset" column) into the same directory. Returns the combined dataframe.
    """
    # metadata_keys = ["sha256", "label", "histogram", "byteentropy", "section"]
    metadata_keys = ["sha256"]

    train_feature_paths = [os.path.join(data_dir, "train_features_{}.jsonl".format(i)) for i in range(6)]
    test_feature_paths = [os.path.join(data_dir, "test_features.jsonl")]

    # The pool is a context manager so its worker processes are released when this function
    # finishes or fails; previously the pool was never closed. imap() keeps the input order,
    # so record i corresponds to line i of the input files.
    with multiprocessing.Pool() as pool:
        train_records = list(pool.imap(read_metadata_record, raw_feature_iterator(train_feature_paths)))

        # Keep only the keys that actually occur, in the order given by metadata_keys
        ordered_metadata_keys = [k for k in metadata_keys if k in train_records[0].keys()]

        train_metadf = pd.DataFrame(train_records)[ordered_metadata_keys]
        train_metadf.to_csv(os.path.join(data_dir, "train_metadata.csv"))

        train_records = [dict(record, **{"subset": "train"}) for record in train_records]

        test_records = list(pool.imap(read_metadata_record, raw_feature_iterator(test_feature_paths)))

    test_metadf = pd.DataFrame(test_records)[ordered_metadata_keys]
    test_metadf.to_csv(os.path.join(data_dir, "test_metadata.csv"))

    test_records = [dict(record, **{"subset": "test"}) for record in test_records]

    all_metadata_keys = ordered_metadata_keys + ["subset"]
    metadf = pd.DataFrame(train_records + test_records)[all_metadata_keys]
    metadf.to_csv(os.path.join(data_dir, "metadata.csv"))
    return metadf

def read_metadata(data_dir):
    """
    Read an already created metadata file and return its dataframe

    The file read is ``metadata.csv`` inside ``data_dir``; its first column becomes the index.
    """
    metadata_path = os.path.join(data_dir, "metadata.csv")
    metadata_frame = pd.read_csv(metadata_path, index_col=0)
    return metadata_frame



In [7]:
# Run create_vectorized_features() to build feature vectors from the raw EMBER features.
# This writes X_train.dat / y_train.dat / X_test.dat / y_test.dat into the data directory.
create_vectorized_features("/content/ember_2017_2/")

Vectorizing training set


100%|██████████| 900000/900000 [02:11<00:00, 6843.74it/s]


Vectorizing test set


100%|██████████| 200000/200000 [00:34<00:00, 5752.69it/s]


In [None]:
# Keep the result of the vectorization step in a variable.
# With no subset given this is the tuple (X_train, y_train, X_test, y_test).
data_features = read_vectorized_features("/content/ember_2017_2/")
data_features

In [9]:
# Pull the feature matrices and the labels out of the tuple above;
# the labels are stored as float32 on disk, so cast them to int.
X_train, y_train, X_test, y_test = data_features
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [10]:
# Build new DataFrames from the feature vectors and attach the matching labels.
# The label arrays are assigned by position, one value per feature row.
dt_Train = pd.DataFrame(X_train)
dt_Train['label'] = y_train

dt_Test = pd.DataFrame(X_test)
dt_Test['label'] = y_test

In [11]:
# Combine the train and test sets (train rows first, then test rows).
# ignore_index=True gives the combined frame a fresh 0..N-1 row index; without it the test
# rows would reuse the labels 0..len(dt_Test)-1 already used by the train rows.
dt_features = pd.concat([dt_Train, dt_Test], ignore_index=True)
dt_features.shape

(1100000, 2382)

In [12]:
# Run create_metadata() to extract the data we need (the sha256 of every sample)
create_metadata("/content/ember_2017_2/")

Unnamed: 0,sha256,subset
0,0abb4fda7d5b13801d63bee53e5e256be43e141faa077a...,train
1,d4206650743b3d519106dea10a38a55c30467c3d9f7875...,train
2,c9cafff8a596ba8a80bafb4ba8ae6f2ef3329d95b85f15...,train
3,7f513818bcc276c531af2e641c597744da807e21cc1160...,train
4,ca65e1c387a4cc9e7d8a8ce12bf1bcf9f534c9032b9d95...,train
...,...,...
1099995,fffe314f23cee3a68ccab272934877d3bc18ec3bd905df...,test
1099996,fffe7a1b23e04facc9ca91a93ac4a34e8b3040e023dbde...,test
1099997,fffe801f51e7ec931515aa49a3d157a9c0fbcdca8c9d80...,test
1099998,fffe92f9593649c4a7050302368189de45e2c1c06b04ea...,test


In [13]:
# Load metadata.csv into a DataFrame
emberdf = read_metadata('/content/ember_2017_2/')
emberdf.head()

  mask |= (ar1 == a)


Unnamed: 0,sha256,subset
0,0abb4fda7d5b13801d63bee53e5e256be43e141faa077a...,train
1,d4206650743b3d519106dea10a38a55c30467c3d9f7875...,train
2,c9cafff8a596ba8a80bafb4ba8ae6f2ef3329d95b85f15...,train
3,7f513818bcc276c531af2e641c597744da807e21cc1160...,train
4,ca65e1c387a4cc9e7d8a8ce12bf1bcf9f534c9032b9d95...,train


In [14]:
# Convert the sha256 column of the DataFrame above into a NumPy array
temp = emberdf['sha256'].to_numpy()
temp

array(['0abb4fda7d5b13801d63bee53e5e256be43e141faa077a6d149874242c3f02c2',
       'd4206650743b3d519106dea10a38a55c30467c3d9f78758690a8bbf478e5b6d4',
       'c9cafff8a596ba8a80bafb4ba8ae6f2ef3329d95b85f15b1af16ab9d6cf65065',
       ...,
       'fffe801f51e7ec931515aa49a3d157a9c0fbcdca8c9d80f942619e98bbdcca23',
       'fffe92f9593649c4a7050302368189de45e2c1c06b04ea398d0e0f4f594e81da',
       'ffffb259a4c5e25ae1437af59caafb718cf8879187cc8cec61d284345e56e79e'],
      dtype=object)

In [15]:
# Add the sha256 array to the vectorized dataset, row by row.
# Both were produced from the same .jsonl files read in the same order (train files, then
# test), so position i in `temp` belongs to row i of dt_features.
dt_features['sha256'] = temp
dt_features.shape

(1100000, 2383)

In [16]:
# Check whether any sha256 value is duplicated: the number of distinct values
# should equal the number of rows in dt_features.
distinct_sha256 = dict.fromkeys(dt_features['sha256'])
mylist = list(distinct_sha256)
len(mylist)

1100000

In [17]:
# Connect to Google Drive (mounted at /content/drive; later cells write CSV files below it)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Save the new DataFrame to a file (sha256, label, feature vector)
# dt_features.to_csv('metaData_features.csv', index=False)

# Write it straight to Drive instead
dt_features.to_csv('/content/drive/MyDrive/BIG_Data/metaData_features.csv', index=False)

In [None]:
# Copy the resulting file into the Google Drive folder
# (disabled: the previous cell already writes directly to Drive)
# !cp /content/metaData_features.csv /content/drive/MyDrive/BIG_Data/metaData_features.csv

In [None]:
# Keep only the malware samples (label == 1) to shrink the dataset,
# and renumber the remaining rows from 0.
is_malware = dt_features['label'] == 1
emberdf_label_1 = dt_features.loc[is_malware].reset_index(drop=True)

In [None]:
# Split the malware rows into folds and record each row's fold id in a 'fold' column.
# NOTE: GroupKFold with its default arguments does not shuffle, so this partition is
# reproducible rather than random. Each row is its own group, because sha256 was checked
# to be unique in an earlier cell.
from sklearn.model_selection import GroupKFold

# Number of folds actually used. A stale `fold = 200` assignment was removed: it did not
# match this value and was overwritten by the loop variable below.
N_FOLDS = 400

gkf = GroupKFold(n_splits=N_FOLDS)
emberdf_label_1['fold'] = -1  # placeholder until every row has been assigned
for fold, (train_idx, val_idx) in enumerate(gkf.split(emberdf_label_1, groups=emberdf_label_1.sha256.tolist())):
    # Only the held-out indices of each split are used: they form fold number `fold`
    emberdf_label_1.loc[val_idx, 'fold'] = fold

emberdf_label_1.shape


In [None]:
# Preview the first rows of the malware-only frame, including the new 'fold' column
emberdf_label_1.head(5)

In [None]:
# Save the malware-only frame (with its fold assignment) to Drive, without the row index
emberdf_label_1.to_csv('/content/drive/MyDrive/BIG_Data/metaData_features_label_1.csv', index=False)

In [None]:
# Take 36 small subsets of the malware rows, one per selected fold.
# The fold ids are a fixed, hand-picked list; their order decides which fold ends up in
# which ember_dfN variable (note that 63 comes before 62).
selected_folds = [
    2, 4, 6, 8, 10, 12,
    16, 18, 20, 22, 24, 26,
    27, 28, 29, 32, 35, 36,
    37, 42, 45, 52, 54, 63,
    62, 72, 81, 82, 88, 90,
    91, 92, 93, 94, 95, 96,
]

# One comprehension replaces 36 copy-pasted statements; the same variable names are bound.
(
    ember_df1, ember_df2, ember_df3, ember_df4, ember_df5, ember_df6,
    ember_df7, ember_df8, ember_df9, ember_df10, ember_df11, ember_df12,
    ember_df13, ember_df14, ember_df15, ember_df16, ember_df17, ember_df18,
    ember_df19, ember_df20, ember_df21, ember_df22, ember_df23, ember_df24,
    ember_df25, ember_df26, ember_df27, ember_df28, ember_df29, ember_df30,
    ember_df31, ember_df32, ember_df33, ember_df34, ember_df35, ember_df36,
) = [
    emberdf_label_1[emberdf_label_1.fold == fold_id].reset_index(drop=True)
    for fold_id in selected_folds
]

In [None]:
# Spot-check one of the subsets: its shape and its first ten rows
print("shape : ", ember_df30.shape)
print(ember_df30.head(10))

In [None]:
# Save the subsets to Drive as df1.csv .. df36.csv.
# One loop replaces 36 copy-pasted to_csv calls; subset N is still written to dfN.csv,
# with the default to_csv options (the row index is written as the first column).
ember_subsets = [
    ember_df1, ember_df2, ember_df3, ember_df4, ember_df5, ember_df6,
    ember_df7, ember_df8, ember_df9, ember_df10, ember_df11, ember_df12,
    ember_df13, ember_df14, ember_df15, ember_df16, ember_df17, ember_df18,
    ember_df19, ember_df20, ember_df21, ember_df22, ember_df23, ember_df24,
    ember_df25, ember_df26, ember_df27, ember_df28, ember_df29, ember_df30,
    ember_df31, ember_df32, ember_df33, ember_df34, ember_df35, ember_df36,
]
for subset_number, subset_frame in enumerate(ember_subsets, start=1):
    subset_frame.to_csv("/content/drive/MyDrive/Data_malware/ember/df{}.csv".format(subset_number))