In [1]:
#loading the dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset(
    "joyce8/EMBER2024",
    split="train",
    streaming=True
)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
#first header row of dataset

In [5]:
sample = next(iter(dataset))
print(sample.keys())




In [6]:
from itertools import islice

for i, row in enumerate(islice(dataset, 5)):
    print(f"Row {i+1}")
    print(row)
    print("-" * 50)


Row 1
--------------------------------------------------
Row 2
--------------------------------------------------
Row 3
--------------------------------------------------
Row 4
--------------------------------------------------
Row 5
--------------------------------------------------


In [7]:
#drop unwanted columns

In [8]:
DROP_COLUMNS = [
    "md5", "sha1", "sha256", "tlsh",
    "first_submission_date", "last_analysis_date",
    "family", "family_confidence",
    "week_id"
]


In [9]:
#Decide target column

In [10]:
TARGET_COLUMN = "label"


In [11]:
#encode file_type column

In [12]:
FILE_TYPE_MAP = {
    "pe": 0,
    "apk": 1,
    "elf": 2,
    "pdf": 3,
    "dotnet": 4
}
DEFAULT_FILE_TYPE = -1


In [13]:
FEATURE_COLUMNS = [
    "histogram",
    "byteentropy",
    "strings",
    "general"
]



In [14]:
sample = next(iter(dataset))

for feature in FEATURE_COLUMNS:
    print(feature, ":", len(sample[feature]))


histogram : 256
byteentropy : 256
strings : 6
general : 4


In [15]:
#feature extraction function

In [16]:
def flatten_feature(value):

    flat = []

    if isinstance(value, dict):
        for k in sorted(value.keys()):
            flat.extend(flatten_feature(value[k]))

    elif isinstance(value, list):
        for item in value:
            flat.extend(flatten_feature(item))

    else:
        # Base case: number
        flat.append(value)

    return flat


In [17]:
row0 = next(iter(dataset))

STRING_KEYS = sorted(row0["strings"].keys())
GENERAL_KEYS = sorted(row0["general"].keys())

print("String keys:", STRING_KEYS)
print("General keys:", GENERAL_KEYS)


String keys: ['avlength', 'entropy', 'numstrings', 'printabledist', 'printables', 'string_counts']
General keys: ['entropy', 'is_pe', 'size', 'start_bytes']


In [18]:
STRING_KEYS = [
    "avlength",
    "entropy",
    "numstrings",
    "printables",
    "string_counts"
]

GENERAL_KEYS = [
    "entropy",
    "is_pe",
    "size"
]


In [19]:
def safe_float(x):
    try:
        return float(x)
    except:
        return 0.0


In [20]:
FEATURE_COLUMNS = [
    "histogram",
    "byteentropy",
    "strings",
    "general"
]

FILE_TYPES = ["pe","apk","elf","pdf","dotnet"]

def encode_file_type(ft):
    vec = [0]*len(FILE_TYPES)
    if ft in FILE_TYPES:
        vec[FILE_TYPES.index(ft)] = 1
    return vec


def extract_features(row):

    vec = []

    # Histogram (always numeric list)
    for v in row["histogram"]:
        vec.append(safe_float(v))

    # Byte entropy
    for v in row["byteentropy"]:
        vec.append(safe_float(v))

    # Strings (safe scalar fields)
    for k in STRING_KEYS:
        vec.append(safe_float(row["strings"].get(k, 0)))

    # General (safe scalar fields)
    for k in GENERAL_KEYS:
        vec.append(safe_float(row["general"].get(k, 0)))

    # File type encoding
    vec.extend(encode_file_type(row["file_type"]))

    return vec


In [21]:
import numpy as np

X = []
y = []

MAX_SAMPLES = 20000   # adjust based on RAM

for i, row in enumerate(dataset):
    
    X.append(extract_features(row))
    y.append(row["label"])
    
    if i == MAX_SAMPLES:
        break

import numpy as np

X = np.array(X, dtype=np.float32)
X = np.nan_to_num(X)

y = np.array(y, dtype=np.int32)



In [22]:
print("Shape:", X.shape)
print("Dtype:", X.dtype)


Shape: (20001, 525)
Dtype: float32


In [23]:
#check class balance

In [24]:
import numpy as np

unique, counts = np.unique(y, return_counts=True)

print(dict(zip(unique, counts)))


{np.int32(0): np.int64(10000), np.int32(1): np.int64(10001)}


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,        # 80% train, 20% test
    random_state=42,
    stratify=y            # preserve balance
)

print(X_train.shape, X_test.shape)


(16000, 525) (4001, 525)


In [26]:
print(X.dtype)


float32


In [27]:
row = next(iter(dataset))

for f in FEATURE_COLUMNS:
    print(f, type(row[f]))


histogram <class 'list'>
byteentropy <class 'list'>
strings <class 'dict'>
general <class 'dict'>


In [28]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",150
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",20
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [29]:
#evaluation

In [30]:
#make predictions

In [31]:
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]


In [32]:
#print metrics

In [33]:
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96      2000
           1       0.97      0.95      0.96      2001

    accuracy                           0.96      4001
   macro avg       0.96      0.96      0.96      4001
weighted avg       0.96      0.96      0.96      4001

ROC-AUC: 0.9933610694652673


In [34]:
import joblib
joblib.dump(rf, "rf_ember.pkl")


['rf_ember.pkl']