In [1]:
# Library
import numpy as np
import pandas as pd
import threading
from sklearn.feature_selection import (chi2, f_classif, mutual_info_classif, VarianceThreshold, RFE, SequentialFeatureSelector, SelectFromModel)
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from deap import base, creator, tools, algorithms  # For genetic algorithm in feature selection
from boruta import BorutaPy  # For Boruta algorithm
from sklearn.svm import SVR  # Example estimator for RFE and SFS
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input Dataset 

DM1 = pd.read_parquet("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CCCS-CIC-AndMal-2020\\cicandmal2020-dynamic.parquet") #DM--> Dataset Malware 1
DM2 = pd.read_parquet("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CCCS-CIC-AndMal-2020\\cicandmal2020-static.parquet") #DM--> Dataset Malware 2

In [3]:
DM1.info('display.max_columns', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53439 entries, 0 to 53438
Data columns (total 145 columns):
 #    Column                                                                              Dtype 
---   ------                                                                              ----- 
 0    Memory_PssTotal                                                                     int32 
 1    Memory_PssClean                                                                     int32 
 2    Memory_SharedDirty                                                                  int32 
 3    Memory_PrivateDirty                                                                 int32 
 4    Memory_SharedClean                                                                  int32 
 5    Memory_PrivateClean                                                                 int32 
 6    Memory_SwapPssDirty                                                                 int8  
 7    Memory_Heap

In [4]:
#Feature Selection
X = DM1.drop(['Hash', 'Family', 'Category','Label'],axis=1).values    #Droping this because classification model will not accept object type elements (float and int only)
# Target variable
y = DM1['Label'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Store selected features
features = {}

In [7]:
# Function to print and append selected features
def print_selected_features(method_name, selected_indices, importance=None):
    selected_features = []
    print(f"\n{method_name} Selected Features:")
    for i, idx in enumerate(selected_indices):
        feature_name = DM1.columns[idx]
        if importance is not None:
            print(f"{i + 1}. Feature {feature_name} (Importance: {importance[idx]:.6f})")
        else:
            print(f"{i + 1}. Feature {feature_name}")
        selected_features.append(feature_name)
    features[method_name] = selected_features

In [8]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_chi2 = scaler.fit_transform(X)

# 1. Chi-Square Test (Filter)
chi2_selector = chi2(X_chi2, y)
chi2_scores = chi2_selector[0]
chi2_top_features = np.argsort(chi2_scores)[::-1][:5]
print_selected_features("Chi-Square", chi2_top_features, chi2_scores)


Chi-Square Selected Features:
1. Feature API_DeviceInfo_android.content.pm.PackageManager_getInstallerPackageName (Importance: nan)
2. Feature API_Process_android.os.Process_start (Importance: nan)
3. Feature API_Database_android.database.sqlite.SQLiteDatabase_create (Importance: nan)
4. Feature API_Network_org.apache.http.impl.client.AbstractHttpClient_execute (Importance: nan)
5. Feature API_DeviceInfo_android.content.pm.PackageManager_getInstalledPackages (Importance: nan)


In [9]:
# 2. ANOVA F-test (Filter)

#Identify constant features (ada Constant Features)
constant_features = [col for col in X.columns if X[col].nunique() <= 1]
X_filtered = X.drop(columns=constant_features)

anova_selector = f_classif(X_filtered, y)
anova_scores = anova_selector[0]
anova_top_features = np.argsort(anova_scores)[::-1][:5]
print_selected_features("ANOVA", anova_top_features, anova_scores)


ANOVA Selected Features:
1. Feature API_DeviceData_android.os.SystemProperties_get (Importance: 1497.637817)
2. Feature API_DeviceData_android.content.ContentResolver_insert (Importance: 681.850159)
3. Feature API_Network_org.apache.http.impl.client.AbstractHttpClient_execute (Importance: 441.402008)
4. Feature API_IPC_android.content.ContextWrapper_registerReceiver (Importance: 441.354523)
5. Feature API_DeviceInfo_android.telephony.TelephonyManager_getSimCountryIso (Importance: 404.332428)


In [10]:
# 3. Mutual Information (Filter)
mutual_info_scores = mutual_info_classif(X, y)
mutual_info_top_features = np.argsort(mutual_info_scores)[::-1][:5]
print_selected_features("Mutual Information", mutual_info_top_features, mutual_info_scores)


Mutual Information Selected Features:
1. Feature Memory_HeapSize (Importance: 0.501720)
2. Feature Network_TotalReceivedBytes (Importance: 0.460811)
3. Feature Network_TotalTransmittedBytes (Importance: 0.361782)
4. Feature Memory_Views (Importance: 0.347938)
5. Feature API_Crypto-Hash_java.security.MessageDigest_update (Importance: 0.328526)


In [11]:
# 4. Variance Threshold (Filter)
vt_selector = VarianceThreshold(threshold=0.1)  # You can adjust the threshold
X_vt = vt_selector.fit_transform(X)
vt_top_features = np.where(vt_selector.get_support())[0]
print_selected_features("Variance Threshold", vt_top_features)


Variance Threshold Selected Features:
1. Feature Memory_PssTotal
2. Feature Memory_PssClean
3. Feature Memory_SharedDirty
4. Feature Memory_PrivateDirty
5. Feature Memory_SharedClean
6. Feature Memory_PrivateClean
7. Feature Memory_HeapSize
8. Feature Memory_HeapAlloc
9. Feature Memory_HeapFree
10. Feature Memory_Views
11. Feature Memory_ViewRootImpl
12. Feature Memory_AppContexts
13. Feature Memory_Activities
14. Feature Memory_Assets
15. Feature Memory_LocalBinders
16. Feature Memory_ProxyBinders
17. Feature Memory_ParcelMemory
18. Feature Memory_ParcelCount
19. Feature Memory_DeathRecipients
20. Feature Memory_OpenSSLSockets
21. Feature Memory_WebViews
22. Feature API_Process_android.app.ActivityManager_killBackgroundProcesses
23. Feature API_Process_android.os.Process_killProcess
24. Feature API_Command_java.lang.Runtime_exec
25. Feature API_Command_java.lang.ProcessBuilder_start
26. Feature API_WebView_android.webkit.WebView_loadUrl
27. Feature API_WebView_android.webkit.WebView_

In [13]:
# 5. Recursive Feature Elimination (RFE) - Wrapper
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X_rfe = rfe_selector.fit_transform(X, y)
rfe_top_features = np.where(rfe_selector.support_)[0]
print_selected_features("RFE", rfe_top_features)


RFE Selected Features:
1. Feature Memory_PssClean
2. Feature Memory_SharedDirty
3. Feature Memory_PrivateDirty
4. Feature Memory_SharedClean
5. Feature Memory_HeapAlloc


In [14]:
# 6. Sequential Feature Selector (SFS) - Wrapper (Running too Long)

# Encode the target variable (Y Still contain String)
#label_encoder = LabelEncoder()
#y_encoded = label_encoder.fit_transform(y)

# Sequential Feature Selector
#sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"), n_features_to_select=5)
#X_sfs = sfs_selector.fit_transform(X, y_encoded)
#sfs_top_features = np.where(sfs_selector.get_support())[0]
#print_selected_features("SFS", sfs_top_features)

In [15]:
# 6. Sequential Feature Selector (SFS) - Wrapper

# Function to run SFS
def run_sfs(X, y_encoded):
    global sfs_selector, X_sfs
    sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"),n_features_to_select=5, direction='forward', n_jobs=-1)
    X_sfs = sfs_selector.fit_transform(X, y_encoded)

# Encode the target variable (Y Still contain String)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running SFS
sfs_thread = threading.Thread(target=run_sfs, args=(X, y_encoded))

# Start the thread
sfs_thread.start()

# Wait for the thread to complete or timeout
sfs_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if sfs_thread.is_alive():
    print("SFS did not complete within the time limit. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print selected features
    sfs_top_features = np.where(sfs_selector.get_support())[0]
    print_selected_features("SFS", sfs_top_features)


SFS did not complete within the time limit. Terminating...


In [16]:
# 7. Genetic Algorithm (DEAP) - Wrapper (Running to Long)
#def evaluate(individual):
#    """ Evaluation function for genetic algorithm """
#    selected_features = [i for i, value in enumerate(individual) if value > 0]
#    if len(selected_features) == 0:
#        return 1000,  # Penalty for selecting no features
#    estimator = RandomForestClassifier()
#    X_selected = X.iloc[:, selected_features]
#    estimator.fit(X_selected, y)
#    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
#creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
#toolbox = base.Toolbox()
#toolbox.register("attr_bool", np.random.randint, 0, 2)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("evaluate", evaluate)
#toolbox.register("mate", tools.cxTwoPoint)
#toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
#toolbox.register("select", tools.selTournament, tournsize=3)

#population = toolbox.population(n=50)
#algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)
#best_individual = tools.selBest(population, k=1)[0]
#genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
#print_selected_features("Genetic Algorithm", genetic_selected_features)

In [17]:
# 7. Genetic Algorithm (DEAP) - Wrapper
def evaluate(individual):
    """Evaluation function for genetic algorithm"""
    selected_features = [i for i, value in enumerate(individual) if value > 0]
    if len(selected_features) == 0:
        return 1000,  # Penalty for selecting no features
    estimator = RandomForestClassifier()
    X_selected = X.iloc[:, selected_features]
    estimator.fit(X_selected, y)
    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Function to run Genetic Algorithm (GA)
def run_ga():
    global population
    population = toolbox.population(n=50)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running the GA
ga_thread = threading.Thread(target=run_ga)

# Start the thread
ga_thread.start()

# Wait for the thread to complete or timeout
ga_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if ga_thread.is_alive():
    print("Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print the selected features
    best_individual = tools.selBest(population, k=1)[0]
    genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
    print_selected_features("Genetic Algorithm", genetic_selected_features)

Exception in thread Thread-6 (run_ga):
Traceback (most recent call last):
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Muhammad Raihan\AppData\Local\Temp\ipykernel_27328\3259204369.py", line 28, in run_ga
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\deap\algorithms.py", line 151, in eaSimple
    for ind, fit in zip(invalid_ind, fitnesses):
  File "C:\Users\Muhammad Raihan\AppData\Local\Temp\ipykernel_27328\3259204369.py", line 10, in evaluate
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^


Genetic Algorithm Selected Features:
1. Feature Memory_PssClean
2. Feature Memory_SharedClean
3. Feature Memory_SwapPssDirty
4. Feature Memory_HeapAlloc
5. Feature Memory_HeapFree
6. Feature Memory_Views
7. Feature Memory_ViewRootImpl
8. Feature Memory_Assets
9. Feature Memory_AssetManagers
10. Feature Memory_LocalBinders
11. Feature Memory_ProxyBinders
12. Feature Memory_DeathRecipients
13. Feature Memory_OpenSSLSockets
14. Feature Memory_WebViews
15. Feature API_Process_android.app.ActivityManager_killBackgroundProcesses
16. Feature API_Process_android.os.Process_killProcess
17. Feature API_Command_java.lang.Runtime_exec
18. Feature API_Command_java.lang.ProcessBuilder_start
19. Feature API_JavaNativeInterface_java.lang.Runtime_loadLibrary
20. Feature API_JavaNativeInterface_java.lang.Runtime_load
21. Feature API_WebView_android.webkit.WebView_loadUrl
22. Feature API_WebView_android.webkit.WebView_loadDataWithBaseURL
23. Feature API_WebView_android.webkit.WebView_addJavascriptInterf

In [18]:
# 8. Boruta Algorithm (Wrapper)
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced')
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta_selector.fit(X, y)
boruta_selected_features = np.where(boruta_selector.support_)[0]
print_selected_features("Boruta", boruta_selected_features)

ValueError: Please check your X and y variable. The provided estimator cannot be fitted to your data.
could not allocate 7340032 bytes

In [19]:
# could not convert string to float for Lasso, Ridge, and Elastic Net so need to change the y variable
# Assuming y is your target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

In [20]:
# 9. Lasso (L1 Regularization) - Embedded
lasso = Lasso(alpha=0.01)
lasso.fit(X, y_encoded)
lasso_support = np.where(lasso.coef_ != 0)[0]
print_selected_features("Lasso", lasso_support)


Lasso Selected Features:
1. Feature Memory_PssTotal
2. Feature Memory_PssClean
3. Feature Memory_SharedDirty
4. Feature Memory_PrivateDirty
5. Feature Memory_SharedClean
6. Feature Memory_PrivateClean
7. Feature Memory_HeapSize
8. Feature Memory_HeapAlloc
9. Feature Memory_HeapFree
10. Feature Memory_Views
11. Feature Memory_ViewRootImpl
12. Feature Memory_AppContexts
13. Feature Memory_Activities
14. Feature Memory_Assets
15. Feature Memory_LocalBinders
16. Feature Memory_ProxyBinders
17. Feature Memory_ParcelMemory
18. Feature Memory_ParcelCount
19. Feature Memory_DeathRecipients
20. Feature Memory_OpenSSLSockets
21. Feature Memory_WebViews
22. Feature API_Process_android.app.ActivityManager_killBackgroundProcesses
23. Feature API_Process_android.os.Process_killProcess
24. Feature API_Command_java.lang.Runtime_exec
25. Feature API_Command_java.lang.ProcessBuilder_start
26. Feature API_WebView_android.webkit.WebView_loadUrl
27. Feature API_WebView_android.webkit.WebView_loadDataWithB

  model = cd_fast.enet_coordinate_descent(


In [21]:
# 10. Ridge (L2 Regularization) - Embedded
ridge = Ridge(alpha=1.0)
ridge.fit(X, y_encoded)
ridge_support = np.where(ridge.coef_ != 0)[0]
print_selected_features("Ridge", ridge_support)


Ridge Selected Features:
1. Feature Memory_PssTotal
2. Feature Memory_PssClean
3. Feature Memory_SharedDirty
4. Feature Memory_PrivateDirty
5. Feature Memory_SharedClean
6. Feature Memory_PrivateClean
7. Feature Memory_HeapSize
8. Feature Memory_HeapAlloc
9. Feature Memory_HeapFree
10. Feature Memory_Views
11. Feature Memory_ViewRootImpl
12. Feature Memory_AppContexts
13. Feature Memory_Activities
14. Feature Memory_Assets
15. Feature Memory_LocalBinders
16. Feature Memory_ProxyBinders
17. Feature Memory_ParcelMemory
18. Feature Memory_ParcelCount
19. Feature Memory_DeathRecipients
20. Feature Memory_OpenSSLSockets
21. Feature Memory_WebViews
22. Feature API_Process_android.app.ActivityManager_killBackgroundProcesses
23. Feature API_Process_android.os.Process_killProcess
24. Feature API_Command_java.lang.Runtime_exec
25. Feature API_Command_java.lang.ProcessBuilder_start
26. Feature API_JavaNativeInterface_java.lang.Runtime_load
27. Feature API_WebView_android.webkit.WebView_loadUrl
2

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [22]:
# 11. Elastic Net - Embedded
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X, y_encoded)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]
print_selected_features("Elastic Net", elastic_net_support)


Elastic Net Selected Features:
1. Feature Memory_PssTotal
2. Feature Memory_PssClean
3. Feature Memory_SharedDirty
4. Feature Memory_PrivateDirty
5. Feature Memory_SharedClean
6. Feature Memory_PrivateClean
7. Feature Memory_HeapSize
8. Feature Memory_HeapAlloc
9. Feature Memory_HeapFree
10. Feature Memory_Views
11. Feature Memory_ViewRootImpl
12. Feature Memory_LocalBinders
13. Feature Memory_ProxyBinders
14. Feature Memory_ParcelMemory
15. Feature Memory_ParcelCount
16. Feature Memory_OpenSSLSockets
17. Feature API_Process_android.app.ActivityManager_killBackgroundProcesses
18. Feature API_Command_java.lang.Runtime_exec
19. Feature API_WebView_android.webkit.WebView_addJavascriptInterface
20. Feature API_WebView_android.webkit.WebView_evaluateJavascript
21. Feature API_FileIO_libcore.io.IoBridge_open
22. Feature API_FileIO_android.content.ContextWrapper_openFileInput
23. Feature API_FileIO_android.content.ContextWrapper_openFileOutput
24. Feature API_FileIO_android.content.ContextWr

  model = cd_fast.enet_coordinate_descent(


In [23]:
# 12. Random Forest Feature Importance - Embedded
rf_model = RandomForestClassifier().fit(X, y)
rf_importances = rf_model.feature_importances_
rf_top_features = np.argsort(rf_importances)[::-1][:5]
print_selected_features("Random Forest", rf_top_features, rf_importances)


Random Forest Selected Features:
1. Feature Process_total (Importance: 0.042191)
2. Feature Battery_wakelock (Importance: 0.039227)
3. Feature Logcat_total (Importance: 0.024331)
4. Feature Memory_SharedClean (Importance: 0.023574)
5. Feature Memory_HeapAlloc (Importance: 0.020164)


In [24]:
# 13. ExtraTreesClassifier - Embedded (Already used in your code)
extratrees = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]
index = np.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
print_selected_features("ExtraTreesClassifier", index, extratrees.feature_importances_)


ExtraTreesClassifier Selected Features:
1. Feature Battery_wakelock (Importance: 0.056942)
2. Feature Process_total (Importance: 0.039842)
3. Feature Memory_SharedClean (Importance: 0.025768)
4. Feature Memory_PrivateClean (Importance: 0.019926)
5. Feature Memory_PssClean (Importance: 0.019705)
6. Feature Logcat_total (Importance: 0.019321)
7. Feature Memory_HeapAlloc (Importance: 0.017398)
8. Feature Memory_SharedDirty (Importance: 0.017084)
9. Feature Memory_HeapSize (Importance: 0.016924)
10. Feature API_DeviceData_android.os.SystemProperties_get (Importance: 0.016046)
11. Feature Memory_PssTotal (Importance: 0.016033)
12. Feature API__sessions (Importance: 0.015882)
13. Feature API_Binder_android.app.ActivityThread_handleReceiver (Importance: 0.015794)
14. Feature Memory_PrivateDirty (Importance: 0.015624)
15. Feature API_Binder_android.app.Activity_startActivity (Importance: 0.015054)
16. Feature Battery_service (Importance: 0.014536)
17. Feature API_DeviceData_android.content.Co

In [25]:
# Compare all selected features
print("\nSummary of Selected Features for Each Method:")
for method, selected in features.items():
    print(f"{method}: {selected}")


Summary of Selected Features for Each Method:
Chi-Square: ['API_DeviceInfo_android.content.pm.PackageManager_getInstallerPackageName', 'API_Process_android.os.Process_start', 'API_Database_android.database.sqlite.SQLiteDatabase_create', 'API_Network_org.apache.http.impl.client.AbstractHttpClient_execute', 'API_DeviceInfo_android.content.pm.PackageManager_getInstalledPackages']
ANOVA: ['API_DeviceData_android.os.SystemProperties_get', 'API_DeviceData_android.content.ContentResolver_insert', 'API_Network_org.apache.http.impl.client.AbstractHttpClient_execute', 'API_IPC_android.content.ContextWrapper_registerReceiver', 'API_DeviceInfo_android.telephony.TelephonyManager_getSimCountryIso']
Mutual Information: ['Memory_HeapSize', 'Network_TotalReceivedBytes', 'Network_TotalTransmittedBytes', 'Memory_Views', 'API_Crypto-Hash_java.security.MessageDigest_update']
RFE: ['Memory_PssClean', 'Memory_SharedDirty', 'Memory_PrivateDirty', 'Memory_SharedClean', 'Memory_HeapAlloc']
Random Forest: ['Pro

In [26]:
DM2.info('display.max_columns', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357805 entries, 0 to 357804
Data columns (total 9505 columns):
 #     Column  Dtype  
---    ------  -----  
 0     F0      object 
 1     F1      int16  
 2     F2      int16  
 3     F3      int16  
 4     F4      int16  
 5     F5      int8   
 6     F6      int8   
 7     F7      int8   
 8     F8      int8   
 9     F9      int8   
 10    F10     int8   
 11    F11     int8   
 12    F12     int8   
 13    F13     int8   
 14    F14     int8   
 15    F15     int8   
 16    F16     int8   
 17    F17     int8   
 18    F18     int8   
 19    F19     int8   
 20    F20     int8   
 21    F21     int8   
 22    F22     int8   
 23    F23     int8   
 24    F24     int8   
 25    F25     int8   
 26    F26     int8   
 27    F27     int8   
 28    F28     int8   
 29    F29     int8   
 30    F30     int8   
 31    F31     int8   
 32    F32     int8   
 33    F33     int8   
 34    F34     int8   
 35    F35     int8   
 36    F36   

In [27]:
#Feature Selection
X = DM2.drop(['Hash', 'Family', 'Category','Label'],axis=1).values    #Droping this because classification model will not accept object type elements (float and int only)
# Target variable
y = DM2['Label'].values

KeyError: "['Hash', 'Family', 'Category'] not found in axis"

In [28]:
# Store selected features
features = {}

In [29]:
# Function to print and append selected features
def print_selected_features(method_name, selected_indices, importance=None):
    selected_features = []
    print(f"\n{method_name} Selected Features:")
    for i, idx in enumerate(selected_indices):
        feature_name = DM2.columns[idx]
        if importance is not None:
            print(f"{i + 1}. Feature {feature_name} (Importance: {importance[idx]:.6f})")
        else:
            print(f"{i + 1}. Feature {feature_name}")
        selected_features.append(feature_name)
    features[method_name] = selected_features

In [30]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_chi2 = scaler.fit_transform(X)

# 1. Chi-Square Test (Filter)
chi2_selector = chi2(X_chi2, y)
chi2_scores = chi2_selector[0]
chi2_top_features = np.argsort(chi2_scores)[::-1][:5]
print_selected_features("Chi-Square", chi2_top_features, chi2_scores)


Chi-Square Selected Features:
1. Feature F92 (Importance: nan)
2. Feature F23 (Importance: nan)
3. Feature F63 (Importance: nan)
4. Feature F97 (Importance: nan)
5. Feature F95 (Importance: nan)


In [31]:
# 2. ANOVA F-test (Filter)
anova_selector = f_classif(X, y)
anova_scores = anova_selector[0]
anova_top_features = np.argsort(anova_scores)[::-1][:5]
print_selected_features("ANOVA", anova_top_features, anova_scores)


ANOVA Selected Features:
1. Feature F38 (Importance: nan)
2. Feature F63 (Importance: nan)
3. Feature F99 (Importance: nan)
4. Feature F6 (Importance: nan)
5. Feature F15 (Importance: nan)


  f = msb / msw


In [32]:
# 3. Mutual Information (Filter)
mutual_info_scores = mutual_info_classif(X, y)
mutual_info_top_features = np.argsort(mutual_info_scores)[::-1][:5]
print_selected_features("Mutual Information", mutual_info_top_features, mutual_info_scores)


Mutual Information Selected Features:
1. Feature F7 (Importance: 0.502548)
2. Feature F128 (Importance: 0.461379)
3. Feature F130 (Importance: 0.364948)
4. Feature F10 (Importance: 0.341874)
5. Feature F76 (Importance: 0.324805)


In [33]:
# 4. Variance Threshold (Filter)
vt_selector = VarianceThreshold(threshold=0.1)  # You can adjust the threshold
X_vt = vt_selector.fit_transform(X)
vt_top_features = np.where(vt_selector.get_support())[0]
print_selected_features("Variance Threshold", vt_top_features)


Variance Threshold Selected Features:
1. Feature F0
2. Feature F1
3. Feature F2
4. Feature F3
5. Feature F4
6. Feature F5
7. Feature F7
8. Feature F8
9. Feature F9
10. Feature F10
11. Feature F11
12. Feature F12
13. Feature F13
14. Feature F14
15. Feature F16
16. Feature F17
17. Feature F18
18. Feature F19
19. Feature F20
20. Feature F21
21. Feature F22
22. Feature F24
23. Feature F25
24. Feature F26
25. Feature F27
26. Feature F30
27. Feature F31
28. Feature F32
29. Feature F33
30. Feature F34
31. Feature F40
32. Feature F41
33. Feature F42
34. Feature F43
35. Feature F44
36. Feature F45
37. Feature F48
38. Feature F50
39. Feature F51
40. Feature F52
41. Feature F53
42. Feature F54
43. Feature F55
44. Feature F56
45. Feature F57
46. Feature F58
47. Feature F59
48. Feature F60
49. Feature F61
50. Feature F62
51. Feature F64
52. Feature F66
53. Feature F67
54. Feature F68
55. Feature F69
56. Feature F70
57. Feature F71
58. Feature F72
59. Feature F73
60. Feature F74
61. Feature F75
62.

In [34]:
# 5. Recursive Feature Elimination (RFE) - Wrapper
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X_rfe = rfe_selector.fit_transform(X, y)
rfe_top_features = np.where(rfe_selector.support_)[0]
print_selected_features("RFE", rfe_top_features)


RFE Selected Features:
1. Feature F2
2. Feature F3
3. Feature F4
4. Feature F5
5. Feature F8


In [35]:
# 6. Sequential Feature Selector (SFS) - Wrapper (Running too Long)

# Encode the target variable (Y Still contain String)
#label_encoder = LabelEncoder()
#y_encoded = label_encoder.fit_transform(y)

# Sequential Feature Selector
#sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"), n_features_to_select=5)
#X_sfs = sfs_selector.fit_transform(X, y_encoded)
#sfs_top_features = np.where(sfs_selector.get_support())[0]
#print_selected_features("SFS", sfs_top_features)

In [36]:
# 6. Sequential Feature Selector (SFS) - Wrapper

# Function to run SFS
def run_sfs(X, y_encoded):
    global sfs_selector, X_sfs
    sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"),n_features_to_select=5, direction='forward', n_jobs=-1)
    X_sfs = sfs_selector.fit_transform(X, y_encoded)

# Encode the target variable (Y Still contain String)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running SFS
sfs_thread = threading.Thread(target=run_sfs, args=(X, y_encoded))

# Start the thread
sfs_thread.start()

# Wait for the thread to complete or timeout
sfs_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if sfs_thread.is_alive():
    print("SFS did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print selected features
    sfs_top_features = np.where(sfs_selector.get_support())[0]
    print_selected_features("SFS", sfs_top_features)

SFS did not complete within the time limit of 10 minutes. Terminating...


In [37]:
# 7. Genetic Algorithm (DEAP) - Wrapper (Running to Long)
#def evaluate(individual):
#    """ Evaluation function for genetic algorithm """
#    selected_features = [i for i, value in enumerate(individual) if value > 0]
#    if len(selected_features) == 0:
#        return 1000,  # Penalty for selecting no features
#    estimator = RandomForestClassifier()
#    X_selected = X.iloc[:, selected_features]
#    estimator.fit(X_selected, y)
#    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
#creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
#toolbox = base.Toolbox()
#toolbox.register("attr_bool", np.random.randint, 0, 2)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("evaluate", evaluate)
#toolbox.register("mate", tools.cxTwoPoint)
#toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
#toolbox.register("select", tools.selTournament, tournsize=3)

#population = toolbox.population(n=50)
#algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)
#best_individual = tools.selBest(population, k=1)[0]
#genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
#print_selected_features("Genetic Algorithm", genetic_selected_features)


In [38]:
# 7. Genetic Algorithm (DEAP) - Wrapper
def evaluate(individual):
    """Evaluation function for genetic algorithm"""
    selected_features = [i for i, value in enumerate(individual) if value > 0]
    if len(selected_features) == 0:
        return 1000,  # Penalty for selecting no features
    estimator = RandomForestClassifier()
    X_selected = X.iloc[:, selected_features]
    estimator.fit(X_selected, y)
    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Function to run Genetic Algorithm (GA)
def run_ga():
    global population
    population = toolbox.population(n=50)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running the GA
ga_thread = threading.Thread(target=run_ga)

# Start the thread
ga_thread.start()

# Wait for the thread to complete or timeout
ga_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if ga_thread.is_alive():
    print("Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print the selected features
    best_individual = tools.selBest(population, k=1)[0]
    genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
    print_selected_features("Genetic Algorithm", genetic_selected_features)



Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...


In [39]:
# 8. Boruta Algorithm (Wrapper)
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced')
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta_selector.fit(X, y)
boruta_selected_features = np.where(boruta_selector.support_)[0]
print_selected_features("Boruta", boruta_selected_features)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	141
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	48
Tentative: 	4
Rejected: 	89
Iteration: 	9 / 100
Confirmed: 	48
Tentative: 	4
Rejected: 	89
Iteration: 	10 / 100
Confirmed: 	48
Tentative: 	4
Rejected: 	89
Iteration: 	11 / 100
Confirmed: 	48
Tentative: 	4
Rejected: 	89
Iteration: 	12 / 100
Confirmed: 	50
Tentative: 	2
Rejected: 	89
Iteration: 	13 / 100
Confirmed: 	50
Tentative: 	2
Rejected: 	89
Iteration: 	14 / 100
Confirmed: 	50
Tentative: 	2
Rejected: 	89
Iteration: 	15 / 100
Confirmed: 	50
Tentative: 	2
Rejected: 	89
Iteration: 	16 / 100
Confirmed: 	51
Tentative: 	1

In [40]:
# could not convert string to float for Lasso, Ridge, and Elastic Net so need to change the y variable
# Assuming y is your target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

In [41]:
# 9. Lasso (L1 Regularization) - Embedded
lasso = Lasso(alpha=0.01)
lasso.fit(X, y_encoded)
lasso_support = np.where(lasso.coef_ != 0)[0]
print_selected_features("Lasso", lasso_support)


Lasso Selected Features:
1. Feature F0
2. Feature F1
3. Feature F2
4. Feature F3
5. Feature F4
6. Feature F5
7. Feature F7
8. Feature F8
9. Feature F9
10. Feature F10
11. Feature F11
12. Feature F12
13. Feature F13
14. Feature F14
15. Feature F16
16. Feature F17
17. Feature F18
18. Feature F19
19. Feature F20
20. Feature F21
21. Feature F22
22. Feature F24
23. Feature F25
24. Feature F26
25. Feature F27
26. Feature F30
27. Feature F32
28. Feature F33
29. Feature F34
30. Feature F40
31. Feature F41
32. Feature F42
33. Feature F43
34. Feature F44
35. Feature F45
36. Feature F46
37. Feature F48
38. Feature F50
39. Feature F51
40. Feature F52
41. Feature F53
42. Feature F54
43. Feature F55
44. Feature F56
45. Feature F57
46. Feature F58
47. Feature F59
48. Feature F60
49. Feature F61
50. Feature F62
51. Feature F64
52. Feature F66
53. Feature F67
54. Feature F68
55. Feature F69
56. Feature F70
57. Feature F71
58. Feature F72
59. Feature F73
60. Feature F74
61. Feature F75
62. Feature F76


  model = cd_fast.enet_coordinate_descent(


In [42]:
# 10. Ridge (L2 Regularization) - Embedded
ridge = Ridge(alpha=1.0)
ridge.fit(X, y_encoded)
ridge_support = np.where(ridge.coef_ != 0)[0]
print_selected_features("Ridge", ridge_support)


Ridge Selected Features:
1. Feature F0
2. Feature F1
3. Feature F2
4. Feature F3
5. Feature F4
6. Feature F5
7. Feature F7
8. Feature F8
9. Feature F9
10. Feature F10
11. Feature F11
12. Feature F12
13. Feature F13
14. Feature F14
15. Feature F16
16. Feature F17
17. Feature F18
18. Feature F19
19. Feature F20
20. Feature F21
21. Feature F22
22. Feature F24
23. Feature F25
24. Feature F26
25. Feature F27
26. Feature F29
27. Feature F30
28. Feature F31
29. Feature F32
30. Feature F33
31. Feature F34
32. Feature F35
33. Feature F40
34. Feature F41
35. Feature F42
36. Feature F43
37. Feature F44
38. Feature F45
39. Feature F46
40. Feature F47
41. Feature F48
42. Feature F49
43. Feature F50
44. Feature F51
45. Feature F52
46. Feature F53
47. Feature F54
48. Feature F55
49. Feature F56
50. Feature F57
51. Feature F58
52. Feature F59
53. Feature F60
54. Feature F61
55. Feature F62
56. Feature F64
57. Feature F65
58. Feature F66
59. Feature F67
60. Feature F68
61. Feature F69
62. Feature F70


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [43]:
# 11. Elastic Net - Embedded
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X, y_encoded)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]
print_selected_features("Elastic Net", elastic_net_support)


Elastic Net Selected Features:
1. Feature F0
2. Feature F1
3. Feature F2
4. Feature F3
5. Feature F4
6. Feature F5
7. Feature F7
8. Feature F8
9. Feature F9
10. Feature F10
11. Feature F11
12. Feature F16
13. Feature F17
14. Feature F18
15. Feature F19
16. Feature F21
17. Feature F24
18. Feature F26
19. Feature F33
20. Feature F34
21. Feature F41
22. Feature F42
23. Feature F43
24. Feature F44
25. Feature F48
26. Feature F50
27. Feature F51
28. Feature F52
29. Feature F53
30. Feature F56
31. Feature F58
32. Feature F59
33. Feature F60
34. Feature F61
35. Feature F64
36. Feature F66
37. Feature F67
38. Feature F68
39. Feature F69
40. Feature F70
41. Feature F71
42. Feature F72
43. Feature F73
44. Feature F74
45. Feature F75
46. Feature F76
47. Feature F77
48. Feature F78
49. Feature F79
50. Feature F82
51. Feature F83
52. Feature F84
53. Feature F88
54. Feature F89
55. Feature F91
56. Feature F96
57. Feature F98
58. Feature F100
59. Feature F102
60. Feature F103
61. Feature F104
62. Fe

  model = cd_fast.enet_coordinate_descent(


In [44]:
# 12. Random Forest Feature Importance - Embedded
rf_model = RandomForestClassifier().fit(X, y)
rf_importances = rf_model.feature_importances_
rf_top_features = np.argsort(rf_importances)[::-1][:5]
print_selected_features("Random Forest", rf_top_features, rf_importances)


Random Forest Selected Features:
1. Feature F140 (Importance: 0.042680)
2. Feature F132 (Importance: 0.039407)
3. Feature F139 (Importance: 0.023495)
4. Feature F4 (Importance: 0.023476)
5. Feature F8 (Importance: 0.021292)


In [45]:
# 13. ExtraTreesClassifier - Embedded (Already used in your code)
extratrees = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]
index = np.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
print_selected_features("ExtraTreesClassifier", index, extratrees.feature_importances_)


ExtraTreesClassifier Selected Features:
1. Feature F132 (Importance: 0.055655)
2. Feature F140 (Importance: 0.039951)
3. Feature F4 (Importance: 0.025627)
4. Feature F1 (Importance: 0.020466)
5. Feature F5 (Importance: 0.020362)
6. Feature F139 (Importance: 0.019542)
7. Feature F8 (Importance: 0.018300)
8. Feature F2 (Importance: 0.017107)
9. Feature F7 (Importance: 0.016837)
10. Feature F0 (Importance: 0.016437)
11. Feature F127 (Importance: 0.016246)
12. Feature F125 (Importance: 0.016058)
13. Feature F3 (Importance: 0.015432)
14. Feature F72 (Importance: 0.015099)
15. Feature F71 (Importance: 0.014716)
16. Feature F9 (Importance: 0.014057)
17. Feature F19 (Importance: 0.014038)
18. Feature F133 (Importance: 0.014011)
19. Feature F12 (Importance: 0.013290)
20. Feature F48 (Importance: 0.013284)
21. Feature F18 (Importance: 0.013248)
22. Feature F17 (Importance: 0.013113)
23. Feature F69 (Importance: 0.013096)
24. Feature F116 (Importance: 0.013089)
25. Feature F134 (Importance: 0.01

In [46]:
# Compare all selected features
print("\nSummary of Selected Features for Each Method:")
for method, selected in features.items():
    print(f"{method}: {selected}")


Summary of Selected Features for Each Method:
Chi-Square: ['F92', 'F23', 'F63', 'F97', 'F95']
ANOVA: ['F38', 'F63', 'F99', 'F6', 'F15']
Mutual Information: ['F7', 'F128', 'F130', 'F10', 'F76']
Variance Threshold: ['F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F24', 'F25', 'F26', 'F27', 'F30', 'F31', 'F32', 'F33', 'F34', 'F40', 'F41', 'F42', 'F43', 'F44', 'F45', 'F48', 'F50', 'F51', 'F52', 'F53', 'F54', 'F55', 'F56', 'F57', 'F58', 'F59', 'F60', 'F61', 'F62', 'F64', 'F66', 'F67', 'F68', 'F69', 'F70', 'F71', 'F72', 'F73', 'F74', 'F75', 'F76', 'F77', 'F78', 'F79', 'F80', 'F81', 'F82', 'F83', 'F84', 'F85', 'F86', 'F87', 'F88', 'F89', 'F90', 'F91', 'F96', 'F98', 'F100', 'F101', 'F102', 'F103', 'F104', 'F105', 'F106', 'F107', 'F108', 'F109', 'F110', 'F111', 'F113', 'F114', 'F115', 'F116', 'F117', 'F118', 'F119', 'F120', 'F121', 'F122', 'F123', 'F125', 'F126', 'F127', 'F128', 'F129', 'F130', 'F131', '