In [2]:
import os
import ember
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

import lime
from lime import lime_tabular

data_dir = "../ember2018/"

In [15]:
# only run this if your ember2018 folder doesnt have any .dat files
ember.create_vectorized_features(data_dir, 2)
_ = ember.create_metadata(data_dir)

Vectorizing training set


100%|██████████| 800000/800000 [09:10<00:00, 1454.39it/s]


Vectorizing test set


100%|██████████| 200000/200000 [02:06<00:00, 1583.29it/s]


In [3]:
lgbm_model = ember.train_model("/workspaces/torment-nexus/ember2018")

[LightGBM] [Info] Number of positive: 300000, number of negative: 300000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 212057
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 2333
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [4]:
emberdf = ember.read_metadata(data_dir)
X_train, y_train, X_test, y_test = ember.read_vectorized_features(data_dir, feature_version=2)
lgbm_model = lgb.Booster(model_file=os.path.join(data_dir, "ember_model_2018.txt"))
lgbm_model.params['objective'] = 'regression'

In [6]:
extractor1 = ember.PEFeatureExtractor(1)
extractor2 = ember.PEFeatureExtractor(2)

file_data = open("/workspaces/torment-nexus/binaries/postgresql-16.2-1-windows-x64.exe", "rb").read()





In [7]:
def flatten_dict(prefix, d):
    flattened = {}
    for key, value in d.items():
        new_key = f"{prefix}_{key}" if prefix else key
        if isinstance(value, dict):
            flattened.update(flatten_dict(new_key, value))
        elif isinstance(value, list):
            for i, item in enumerate(value):
                flattened[f"{new_key}_{i}"] = item
        else:
            flattened[new_key] = value
    return flattened

In [8]:
raw_features = extractor2.raw_features(file_data)
np.array(raw_features)

array({'sha256': 'cbdf32c0be69c69cb526367e423ccb7c96f5cf221866037c693302696a3057cb', 'histogram': [2122573, 1456502, 1390181, 1479044, 1410129, 1404690, 1438135, 1491895, 1395571, 1379818, 1372636, 1458099, 1451108, 1444674, 1508956, 1572563, 1360427, 1384864, 1355478, 1435379, 1393647, 1397328, 1441496, 1507088, 1466477, 1428589, 1424410, 1493034, 1477865, 1494945, 1521811, 1595165, 1372767, 1356284, 1360330, 1435028, 1514697, 1396438, 1412825, 1482352, 1397302, 1405237, 1376327, 1450811, 1444067, 1445791, 1468720, 1551373, 1445376, 1453729, 1403485, 1464992, 1426302, 1423108, 1463412, 1513231, 1469641, 1478736, 1484503, 1516613, 1573618, 1536143, 1577993, 1641137, 1358389, 1406846, 1365929, 1434611, 1416002, 1428200, 1429739, 1489853, 1355497, 1407977, 1388809, 1447015, 1413687, 1432097, 1459367, 1546191, 1361923, 1412687, 1378999, 1437145, 1380130, 1407241, 1437295, 1492935, 1441962, 1447418, 1428212, 1479504, 1482582, 1486711, 1527005, 1609612, 1422769, 1472665, 1424973, 1500647, 1

In [9]:
feature_vector = extractor2.feature_vector(file_data)
np.array(feature_vector, dtype=np.float32)

array([0.00557905, 0.00382832, 0.003654  , ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [10]:
ember.predict_sample(lgbm_model, file_data)

2.0144306326638549e-07

In [11]:
lgbm_model.predict([np.array(feature_vector, dtype=np.float32)])[0]

2.0144306326638549e-07

In [12]:
flat = dict(flatten_dict("",raw_features))
df = pd.DataFrame(flat)
df.to_csv('/workspaces/torment-nexus/resources/output2.csv', index=False)

In [13]:
explainer = shap.TreeExplainer(lgbm_model)

In [23]:
shap_values = explainer(np.array([feature_vector], dtype=np.float32))

In [24]:
shap.initjs()
shap.force_plot(shap_values)


In [None]:
shap.summary_plot(shap_values, X_test)