In [1]:
# Library
import numpy as np
import pandas as pd
import threading
from sklearn.feature_selection import (chi2, f_classif, mutual_info_classif, VarianceThreshold, RFE, SequentialFeatureSelector, SelectFromModel)
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from deap import base, creator, tools, algorithms  # For genetic algorithm in feature selection
from boruta import BorutaPy  # For Boruta algorithm
from sklearn.svm import SVR  # Example estimator for RFE and SFS
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input Dataset 

DM = pd.read_csv("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\Obfuscated-MalMem2022\\Obfuscated-MalMem2022.csv") #DM--> Dataset Malware

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58596 entries, 0 to 58595
Data columns (total 57 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Category                                58596 non-null  object 
 1   pslist.nproc                            58596 non-null  int64  
 2   pslist.nppid                            58596 non-null  int64  
 3   pslist.avg_threads                      58596 non-null  float64
 4   pslist.nprocs64bit                      58596 non-null  int64  
 5   pslist.avg_handlers                     58596 non-null  float64
 6   dlllist.ndlls                           58596 non-null  int64  
 7   dlllist.avg_dlls_per_proc               58596 non-null  float64
 8   handles.nhandles                        58596 non-null  int64  
 9   handles.avg_handles_per_proc            58596 non-null  float64
 10  handles.nport                           58596 non-null  in

In [4]:
#Feature Selection
X = DM.drop(['Category','Class'],axis=1).values    #Droping this because classification model will not accept object type elements (float and int only)
# Target variable
y = DM['Class'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Store selected features
features = {}

In [7]:
# Function to print and append selected features
def print_selected_features(method_name, selected_indices, importance=None):
    selected_features = []
    print(f"\n{method_name} Selected Features:")
    for i, idx in enumerate(selected_indices):
        feature_name = DM.columns[idx]
        if importance is not None:
            print(f"{i + 1}. Feature {feature_name} (Importance: {importance[idx]:.6f})")
        else:
            print(f"{i + 1}. Feature {feature_name}")
        selected_features.append(feature_name)
    features[method_name] = selected_features

In [8]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_chi2 = scaler.fit_transform(X)

# 1. Chi-Square Test (Filter)
chi2_selector = chi2(X_chi2, y)
chi2_scores = chi2_selector[0]
chi2_top_features = np.argsort(chi2_scores)[::-1][:5]
print_selected_features("Chi-Square", chi2_top_features, chi2_scores)


Chi-Square Selected Features:
1. Feature handles.avg_handles_per_proc (Importance: nan)
2. Feature svcscan.shared_process_services (Importance: nan)
3. Feature pslist.avg_threads (Importance: nan)
4. Feature handles.nsection (Importance: 2535.573241)
5. Feature handles.nfile (Importance: 1618.806917)


In [9]:
# 2. ANOVA F-test (Filter)

#Identify constant features (ada Constant Features)
constant_features = [col for col in X.columns if X[col].nunique() <= 1]
X_filtered = X.drop(columns=constant_features)

anova_selector = f_classif(X_filtered, y)
anova_scores = anova_selector[0]
anova_top_features = np.argsort(anova_scores)[::-1][:5]
print_selected_features("ANOVA", anova_top_features, anova_scores)


ANOVA Selected Features:
1. Feature pslist.avg_handlers (Importance: 277133.119001)
2. Feature handles.avg_handles_per_proc (Importance: 194758.424111)
3. Feature handles.nevent (Importance: 186888.368384)
4. Feature handles.nsemaphore (Importance: 159452.876708)
5. Feature pslist.nprocs64bit (Importance: 123864.994761)


In [10]:
# 3. Mutual Information (Filter)
mutual_info_scores = mutual_info_classif(X, y)
mutual_info_top_features = np.argsort(mutual_info_scores)[::-1][:5]
print_selected_features("Mutual Information", mutual_info_top_features, mutual_info_scores)


Mutual Information Selected Features:
1. Feature modules.nmodules (Importance: 0.687163)
2. Feature svcscan.process_services (Importance: 0.681367)
3. Feature svcscan.nservices (Importance: 0.674654)
4. Feature dlllist.ndlls (Importance: 0.672364)
5. Feature handles.nhandles (Importance: 0.658678)


In [11]:
# 4. Variance Threshold (Filter)
vt_selector = VarianceThreshold(threshold=0.1)  # You can adjust the threshold
X_vt = vt_selector.fit_transform(X)
vt_top_features = np.where(vt_selector.get_support())[0]
print_selected_features("Variance Threshold", vt_top_features)


Variance Threshold Selected Features:
1. Feature Category
2. Feature pslist.nproc
3. Feature pslist.nppid
4. Feature pslist.nprocs64bit
5. Feature pslist.avg_handlers
6. Feature dlllist.ndlls
7. Feature dlllist.avg_dlls_per_proc
8. Feature handles.nhandles
9. Feature handles.nport
10. Feature handles.nfile
11. Feature handles.nevent
12. Feature handles.ndesktop
13. Feature handles.nkey
14. Feature handles.nthread
15. Feature handles.ndirectory
16. Feature handles.nsemaphore
17. Feature handles.ntimer
18. Feature handles.nsection
19. Feature handles.nmutant
20. Feature ldrmodules.not_in_load
21. Feature ldrmodules.not_in_init
22. Feature ldrmodules.not_in_mem_avg
23. Feature malfind.ninjections
24. Feature malfind.commitCharge
25. Feature malfind.protection
26. Feature malfind.uniqueInjections
27. Feature psxview.not_in_eprocess_pool
28. Feature psxview.not_in_ethread_pool
29. Feature psxview.not_in_pspcid_list
30. Feature psxview.not_in_csrss_handles
31. Feature psxview.not_in_session

In [12]:
# 5. Recursive Feature Elimination (RFE) - Wrapper
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X_rfe = rfe_selector.fit_transform(X, y)
rfe_top_features = np.where(rfe_selector.support_)[0]
print_selected_features("RFE", rfe_top_features)


RFE Selected Features:
1. Feature dlllist.ndlls
2. Feature handles.nsection
3. Feature modules.nmodules
4. Feature svcscan.nservices
5. Feature svcscan.process_services


In [13]:
# 6. Sequential Feature Selector (SFS) - Wrapper (Running too Long)

# Encode the target variable (Y Still contain String)
#label_encoder = LabelEncoder()
#y_encoded = label_encoder.fit_transform(y)

# Sequential Feature Selector
#sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"), n_features_to_select=5)
#X_sfs = sfs_selector.fit_transform(X, y_encoded)
#sfs_top_features = np.where(sfs_selector.get_support())[0]
#print_selected_features("SFS", sfs_top_features)

In [14]:
# 6. Sequential Feature Selector (SFS) - Wrapper

# Function to run SFS
def run_sfs(X, y_encoded):
    global sfs_selector, X_sfs
    sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"),n_features_to_select=5, direction='forward', n_jobs=-1)
    X_sfs = sfs_selector.fit_transform(X, y_encoded)

# Encode the target variable (Y Still contain String)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running SFS
sfs_thread = threading.Thread(target=run_sfs, args=(X, y_encoded))

# Start the thread
sfs_thread.start()

# Wait for the thread to complete or timeout
sfs_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if sfs_thread.is_alive():
    print("SFS did not complete within the time limit. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print selected features
    sfs_top_features = np.where(sfs_selector.get_support())[0]
    print_selected_features("SFS", sfs_top_features)


SFS did not complete within the time limit. Terminating...


In [15]:
# 7. Genetic Algorithm (DEAP) - Wrapper (Running to Long)
#def evaluate(individual):
#    """ Evaluation function for genetic algorithm """
#    selected_features = [i for i, value in enumerate(individual) if value > 0]
#    if len(selected_features) == 0:
#        return 1000,  # Penalty for selecting no features
#    estimator = RandomForestClassifier()
#    X_selected = X.iloc[:, selected_features]
#    estimator.fit(X_selected, y)
#    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
#creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
#toolbox = base.Toolbox()
#toolbox.register("attr_bool", np.random.randint, 0, 2)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("evaluate", evaluate)
#toolbox.register("mate", tools.cxTwoPoint)
#toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
#toolbox.register("select", tools.selTournament, tournsize=3)

#population = toolbox.population(n=50)
#algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)
#best_individual = tools.selBest(population, k=1)[0]
#genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
#print_selected_features("Genetic Algorithm", genetic_selected_features)

In [16]:
# 7. Genetic Algorithm (DEAP) - Wrapper
def evaluate(individual):
    """Evaluation function for genetic algorithm"""
    selected_features = [i for i, value in enumerate(individual) if value > 0]
    if len(selected_features) == 0:
        return 1000,  # Penalty for selecting no features
    estimator = RandomForestClassifier()
    X_selected = X.iloc[:, selected_features]
    estimator.fit(X_selected, y)
    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Function to run Genetic Algorithm (GA)
def run_ga():
    global population
    population = toolbox.population(n=50)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running the GA
ga_thread = threading.Thread(target=run_ga)

# Start the thread
ga_thread.start()

# Wait for the thread to complete or timeout
ga_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if ga_thread.is_alive():
    print("Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print the selected features
    best_individual = tools.selBest(population, k=1)[0]
    genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
    print_selected_features("Genetic Algorithm", genetic_selected_features)

Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...


In [17]:
# 8. Boruta Algorithm (Wrapper)
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced')
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta_selector.fit(X, y)
boruta_selected_features = np.where(boruta_selector.support_)[0]
print_selected_features("Boruta", boruta_selected_features)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	9 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	10 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	11 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	12 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	13 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	14 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	15 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
Iteration: 	16 / 100
Confirmed: 	45
Tentative: 	2
Rejected: 	8
I

In [18]:
# could not convert string to float for Lasso, Ridge, and Elastic Net so need to change the y variable
# Assuming y is your target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

In [19]:
# 9. Lasso (L1 Regularization) - Embedded
lasso = Lasso(alpha=0.01)
lasso.fit(X, y_encoded)
lasso_support = np.where(lasso.coef_ != 0)[0]
print_selected_features("Lasso", lasso_support)


Lasso Selected Features:
1. Feature pslist.nprocs64bit
2. Feature pslist.avg_handlers
3. Feature dlllist.ndlls
4. Feature dlllist.avg_dlls_per_proc
5. Feature handles.nhandles
6. Feature handles.nport
7. Feature handles.nfile
8. Feature handles.nevent
9. Feature handles.ndesktop
10. Feature handles.nkey
11. Feature handles.nthread
12. Feature handles.ndirectory
13. Feature handles.nsemaphore
14. Feature handles.ntimer
15. Feature handles.nsection
16. Feature handles.nmutant
17. Feature ldrmodules.not_in_load
18. Feature ldrmodules.not_in_init
19. Feature malfind.ninjections
20. Feature malfind.commitCharge
21. Feature malfind.uniqueInjections
22. Feature psxview.not_in_eprocess_pool
23. Feature psxview.not_in_csrss_handles
24. Feature svcscan.nservices
25. Feature svcscan.process_services
26. Feature svcscan.interactive_process_services
27. Feature svcscan.nactive


  model = cd_fast.enet_coordinate_descent(


In [20]:
# 10. Ridge (L2 Regularization) - Embedded
ridge = Ridge(alpha=1.0)
ridge.fit(X, y_encoded)
ridge_support = np.where(ridge.coef_ != 0)[0]
print_selected_features("Ridge", ridge_support)


Ridge Selected Features:
1. Feature Category
2. Feature pslist.nproc
3. Feature pslist.nppid
4. Feature pslist.nprocs64bit
5. Feature pslist.avg_handlers
6. Feature dlllist.ndlls
7. Feature dlllist.avg_dlls_per_proc
8. Feature handles.nhandles
9. Feature handles.nport
10. Feature handles.nfile
11. Feature handles.nevent
12. Feature handles.ndesktop
13. Feature handles.nkey
14. Feature handles.nthread
15. Feature handles.ndirectory
16. Feature handles.nsemaphore
17. Feature handles.ntimer
18. Feature handles.nsection
19. Feature handles.nmutant
20. Feature ldrmodules.not_in_load
21. Feature ldrmodules.not_in_init
22. Feature ldrmodules.not_in_mem
23. Feature ldrmodules.not_in_load_avg
24. Feature ldrmodules.not_in_init_avg
25. Feature ldrmodules.not_in_mem_avg
26. Feature malfind.ninjections
27. Feature malfind.commitCharge
28. Feature malfind.protection
29. Feature malfind.uniqueInjections
30. Feature psxview.not_in_pslist
31. Feature psxview.not_in_eprocess_pool
32. Feature psxview.n

In [21]:
# 11. Elastic Net - Embedded
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X, y_encoded)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]
print_selected_features("Elastic Net", elastic_net_support)


Elastic Net Selected Features:
1. Feature pslist.nprocs64bit
2. Feature dlllist.avg_dlls_per_proc
3. Feature handles.nhandles
4. Feature handles.nport
5. Feature handles.nfile
6. Feature handles.ndesktop
7. Feature handles.nkey
8. Feature handles.ndirectory
9. Feature handles.ntimer
10. Feature handles.nsection
11. Feature malfind.ninjections


  model = cd_fast.enet_coordinate_descent(


In [22]:
# 12. Random Forest Feature Importance - Embedded
rf_model = RandomForestClassifier().fit(X, y)
rf_importances = rf_model.feature_importances_
rf_top_features = np.argsort(rf_importances)[::-1][:5]
print_selected_features("Random Forest", rf_top_features, rf_importances)


Random Forest Selected Features:
1. Feature modules.nmodules (Importance: 0.165089)
2. Feature svcscan.nservices (Importance: 0.153588)
3. Feature svcscan.process_services (Importance: 0.151957)
4. Feature handles.nsection (Importance: 0.086837)
5. Feature handles.nfile (Importance: 0.068264)


In [23]:
# 13. ExtraTreesClassifier - Embedded (Already used in your code)
extratrees = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]
index = np.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
print_selected_features("ExtraTreesClassifier", index, extratrees.feature_importances_)


ExtraTreesClassifier Selected Features:
1. Feature dlllist.ndlls (Importance: 0.130394)
2. Feature handles.nsection (Importance: 0.082386)
3. Feature handles.nkey (Importance: 0.070852)
4. Feature pslist.avg_handlers (Importance: 0.063311)
5. Feature svcscan.fs_drivers (Importance: 0.062965)
6. Feature svcscan.process_services (Importance: 0.056262)
7. Feature pslist.nppid (Importance: 0.048662)
8. Feature handles.nsemaphore (Importance: 0.045185)
9. Feature handles.nfile (Importance: 0.043966)
10. Feature ldrmodules.not_in_init (Importance: 0.039778)
11. Feature modules.nmodules (Importance: 0.036969)
12. Feature handles.nmutant (Importance: 0.033855)
13. Feature svcscan.nservices (Importance: 0.030114)
14. Feature handles.ndesktop (Importance: 0.024377)
15. Feature handles.nhandles (Importance: 0.024239)
16. Feature pslist.nproc (Importance: 0.021407)
17. Feature ldrmodules.not_in_load (Importance: 0.019714)


In [24]:
# Compare all selected features
print("\nSummary of Selected Features for Each Method:")
for method, selected in features.items():
    print(f"{method}: {selected}")


Summary of Selected Features for Each Method:
Chi-Square: ['handles.avg_handles_per_proc', 'svcscan.shared_process_services', 'pslist.avg_threads', 'handles.nsection', 'handles.nfile']
ANOVA: ['pslist.avg_handlers', 'handles.avg_handles_per_proc', 'handles.nevent', 'handles.nsemaphore', 'pslist.nprocs64bit']
Mutual Information: ['modules.nmodules', 'svcscan.process_services', 'svcscan.nservices', 'dlllist.ndlls', 'handles.nhandles']
Variance Threshold: ['Category', 'pslist.nproc', 'pslist.nppid', 'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls', 'dlllist.avg_dlls_per_proc', 'handles.nhandles', 'handles.nport', 'handles.nfile', 'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread', 'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer', 'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load', 'ldrmodules.not_in_init', 'ldrmodules.not_in_mem_avg', 'malfind.ninjections', 'malfind.commitCharge', 'malfind.protection', 'malfind.uniqueInjections', 