In [5]:
import os
import ember
import lightgbm as lgb
import numpy as np
import pandas as pd
binary_name = "mimikatz.exe" #set this to the binary name, put the ones u want to test in /binaries

data_dir = "../ember2018/"

In [6]:
binary_location = f"/workspaces/torment-nexus/binaries/{binary_name}"

def get_feature_names() -> list[str]:
    byte_histogram = [f"Byte Histogram {a}" for a in range(256)] #256

    byte_entropy_histogram = [f"Byte Entropy Histogram {a}" for a in range(256)] #256

    strings_1 = [f"string.{a}" for a in ["numstrings", "avlength", "printables"]]
    strings_2 = [f"string.printabledist_{b}" for b in range(96)]
    strings_3 = [f"string.{a}" for a in ["entropy", "paths", "urls", "registry", "MZ"]] # 8 + 96
    strings = np.concatenate((strings_1,strings_2,strings_3))

    general_info = [f"general.{a}" for a in ["size", "vsize", "has_debug", "exports", "imports", "has_relocations", "has_resources", "has_signature", "has_tls", "symbols"]]

    header_coff = ["header.coff.timestamp"]
    header_coff_machine = [f"header.coff.machine_{a}" for a in range(10)]
    header_coff_characteristics = [f"header.coff.characteristic_{a}" for a in range(10)]
    header_coff_subsystem = [f"header.optional.subsystem_{a}" for a in range(10)]
    header_coff_dll_characteristics = [f"header.optional.dll_characteristic_{a}" for a in range(10)]
    header_coff_magic = [f"header.optional.magic_{a}" for a in range(10)]
    header_optional = [f"header.optional.{a}" for a in ["major_image_version", "minor_image_version", "major_linker_version", "minor_linker_version", "major_operating_system_version", "minor_operating_system_version", "major_subsystem_version", "minor_subsystem_version", "sizeof_code", "sizeof_headers", "sizeof_heap_commit"]] #12
    header = np.concatenate((header_coff,header_coff_machine,header_coff_characteristics,header_coff_subsystem,header_coff_dll_characteristics,header_coff_magic,header_optional))

    sections_general = [f"sections.{a}" for a in ["section_count", "num_empty_sections", "num_unnamed_sections", "num_read_execute_sections", "num_write_sections",]] #JUST general
    sections_section_sizes = [f"sections.section_{a}_size" for a in range(50)] # this messes with hashing which i will understand at a later time
    sections_section_entropy = [f"sections.section_{a}_entropy" for a in range(50)]
    sections_section_vsize = [f"sections.section_{a}_vsize" for a in range(50)]
    sections_entry_name = [f"sections.entry_name_{a}" for a in range(50)]
    sections_characteristics = [f"sections.characteristics_{a}" for a in range(50)]
    sections = np.concatenate((sections_general, sections_section_sizes, sections_section_entropy, sections_section_vsize, sections_entry_name, sections_characteristics))

    imports_libraries = [f"imports.libraries.library_{a}" for a in range(256)]
    imports_imports = [f"imports.import_{a}" for a in range(1024)]
    imports = np.concatenate((imports_libraries,imports_imports))

    exports = [f"exports.export_{a}" for a in range(128)]

    name_order = [a.lower() for a in ["EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE","BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE","BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"]]
    data_directories_unflat = [[f"directories.{a}_size", f"directories.{a}_vaddress"] for a in name_order]
    data_directories = [item for sublist in data_directories_unflat for item in sublist]


    feature_names = np.concatenate((byte_histogram, byte_entropy_histogram, strings, general_info, header, sections, imports, exports, data_directories))
    return feature_names

def classify_binary(binary_location:str) -> float:
    lgbm_model = lgb.Booster(model_file=os.path.join(data_dir, "ember_model_2018.txt"))
    extractor2 = ember.PEFeatureExtractor(2)

    file_data = open(binary_location, "rb").read()
    feature_vector = extractor2.feature_vector(file_data)

    return lgbm_model.predict([np.array(feature_vector, dtype=np.float32)])[0]

def classify_vectors(feature_vector:np.ndarray | pd.DataFrame) -> float:
    if type(feature_vector) is pd.DataFrame:
        feature_vector = feature_vector.to_numpy()

    lgbm_model = lgb.Booster(model_file=os.path.join(data_dir, "ember_model_2018.txt"))
    return lgbm_model.predict([np.array(feature_vector, dtype=np.float32)])[0]

def get_vectors(binary_location:str) -> np.ndarray:
    extractor2 = ember.PEFeatureExtractor(2)
    file_data = open(binary_location, "rb").read()
    return extractor2.feature_vector(file_data)

def get_dataframe(feature_vector:np.ndarray) -> pd.DataFrame:
    df = pd.DataFrame(feature_vector).T
    df.columns = get_feature_names()
    return df

In [7]:
get_dataframe(get_vectors(binary_location))



Unable to find the section associated with EXPORT_TABLE
Can't read the export table at 0xffffffff


Unnamed: 0,Byte Histogram 0,Byte Histogram 1,Byte Histogram 2,Byte Histogram 3,Byte Histogram 4,Byte Histogram 5,Byte Histogram 6,Byte Histogram 7,Byte Histogram 8,Byte Histogram 9,...,directories.load_config_table_size,directories.load_config_table_vaddress,directories.bound_import_size,directories.bound_import_vaddress,directories.iat_size,directories.iat_vaddress,directories.delay_import_descriptor_size,directories.delay_import_descriptor_vaddress,directories.clr_runtime_header_size,directories.clr_runtime_header_vaddress
0,0.206864,0.008583,0.00417,0.00469,0.007864,0.002909,0.002785,0.00268,0.00986,0.001998,...,0.0,0.0,0.0,0.0,2416.0,610304.0,96.0,912092.0,0.0,0.0


In [8]:

def objective_function(df):
    return classify_vectors(df)

df = get_dataframe(get_vectors(binary_location))

num_particles = 30
num_iterations = 100
dim = df.shape[1]
c1 = c2 = 1.49445
w = 0.729
boundsdict = {"header.coff.timestamp": (0, 0xFFFFFFFF), "directories.certificate_table_size": (0,0xFFFFFFFF)}
changeable_str = ["header.coff.timestamp", "directories.certificate_table_size"]
bounds = []
changeable = []
for index,feature in enumerate(df):
    if feature not in changeable_str:
        bounds.append((df[feature].iloc[0], df[feature].iloc[0]))
    else:
        bounds.append(boundsdict[feature])
        changeable.append(index)

particles = np.random.uniform([b[0] for b in bounds], [b[1] for b in bounds], (num_particles, dim))
velocities = np.random.uniform(-1, 1, (num_particles, dim))
personal_best_positions = np.copy(particles)
personal_best_scores = np.array([objective_function(p) for p in particles])
global_best_position = personal_best_positions[np.argmin(personal_best_scores)]


for iteration in range(num_iterations):
    for i in range(num_particles):
        for j in changeable:
            velocities[i, j] = (w * velocities[i, j] +
                                c1 * np.random.rand() * (personal_best_positions[i, j] - particles[i, j]) +
                                c2 * np.random.rand() * (global_best_position[j] - particles[i, j]))
            particles[i, j] += velocities[i, j]
            particles[i, j] = np.clip(particles[i, j], bounds[j][0], bounds[j][1])
        score = objective_function(particles[i])
        if score < personal_best_scores[i]:
            personal_best_scores[i] = score
            personal_best_positions[i] = particles[i]
            
    global_best_position = personal_best_positions[np.argmin(personal_best_scores)]
    print(f"Score so far: {global_best_position}, iteration #{iteration+1}")

print("Best position:", global_best_position)
print("Best score:", objective_function(global_best_position))




Unable to find the section associated with EXPORT_TABLE
Can't read the export table at 0xffffffff


Score so far: [2.06864238e-01 8.58298130e-03 4.16987715e-03 ... 9.12092000e+05
 0.00000000e+00 0.00000000e+00], iteration #1
Score so far: [2.06864238e-01 8.58298130e-03 4.16987715e-03 ... 9.12092000e+05
 0.00000000e+00 0.00000000e+00], iteration #2
Score so far: [2.06864238e-01 8.58298130e-03 4.16987715e-03 ... 9.12092000e+05
 0.00000000e+00 0.00000000e+00], iteration #3


KeyboardInterrupt: 

In [None]:
for index,feature in enumerate(df):
    print(f"{feature}, {index}")