In [1]:
# Library
import numpy as np
import pandas as pd
import threading
from sklearn.feature_selection import (chi2, f_classif, mutual_info_classif, VarianceThreshold, RFE, SequentialFeatureSelector, SelectFromModel)
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from deap import base, creator, tools, algorithms  # For genetic algorithm in feature selection
from boruta import BorutaPy  # For Boruta algorithm
from sklearn.svm import SVR  # Example estimator for RFE and SFS
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input Dataset 

DM = pd.read_csv("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CIC-PDFMal2022\\PDFMalware2022.csv") #DM--> Dataset Malware

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10026 entries, 0 to 10025
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fine name         10026 non-null  object 
 1   pdfsize           10025 non-null  float64
 2   metadata size     10025 non-null  float64
 3   pages             10025 non-null  float64
 4   xref Length       10025 non-null  float64
 5   title characters  10025 non-null  float64
 6   isEncrypted       10025 non-null  float64
 7   embedded files    10025 non-null  float64
 8   images            10025 non-null  object 
 9   text              10025 non-null  object 
 10  header            10025 non-null  object 
 11  obj               10023 non-null  object 
 12  endobj            10023 non-null  object 
 13  stream            10023 non-null  float64
 14  endstream         10023 non-null  object 
 15  xref              10023 non-null  object 
 16  trailer           10023 non-null  float6

In [4]:
#Feature Selection
X = DM.drop(['Fine name','images','text','header','obj','endobj','endstream','xref','startxref','pageno','JS','Javascript','AA','OpenAction','Acroform','JBIG2Decode','RichMedia','launch','EmbeddedFile','XFA','Class'],axis=1).values    #Droping this because classification model will not accept object type elements (float and int only)
# Target variable
y = DM['Class'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Store selected features
features = {}

In [7]:
# Function to print and append selected features
def print_selected_features(method_name, selected_indices, importance=None):
    selected_features = []
    print(f"\n{method_name} Selected Features:")
    for i, idx in enumerate(selected_indices):
        feature_name = DM.columns[idx]
        if importance is not None:
            print(f"{i + 1}. Feature {feature_name} (Importance: {importance[idx]:.6f})")
        else:
            print(f"{i + 1}. Feature {feature_name}")
        selected_features.append(feature_name)
    features[method_name] = selected_features

In [8]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_chi2 = scaler.fit_transform(X)

# 1. Chi-Square Test (Filter)
chi2_selector = chi2(X_chi2, y)
chi2_scores = chi2_selector[0]
chi2_top_features = np.argsort(chi2_scores)[::-1][:5]
print_selected_features("Chi-Square", chi2_top_features, chi2_scores)


Chi-Square Selected Features:
1. Feature embedded files (Importance: 133.432818)
2. Feature pages (Importance: 69.388846)
3. Feature header (Importance: 13.238948)
4. Feature images (Importance: 9.111338)
5. Feature metadata size (Importance: 8.704534)


In [9]:
# 2. ANOVA F-test (Filter)
anova_selector = f_classif(X, y)
anova_scores = anova_selector[0]
anova_top_features = np.argsort(anova_scores)[::-1][:5]
print_selected_features("ANOVA", anova_top_features, anova_scores)


ANOVA Selected Features:
1. Feature embedded files (Importance: 1895.294585)
2. Feature images (Importance: 528.887180)
3. Feature header (Importance: 355.830508)
4. Feature text (Importance: 294.038792)
5. Feature title characters (Importance: 170.265500)


In [10]:
# 3. Mutual Information (Filter)
mutual_info_scores = mutual_info_classif(X, y)
mutual_info_top_features = np.argsort(mutual_info_scores)[::-1][:5]
print_selected_features("Mutual Information", mutual_info_top_features, mutual_info_scores)


Mutual Information Selected Features:
1. Feature pdfsize (Importance: 0.416916)
2. Feature pages (Importance: 0.400755)
3. Feature Fine name (Importance: 0.328007)
4. Feature embedded files (Importance: 0.284176)
5. Feature images (Importance: 0.249273)


In [11]:
# 4. Variance Threshold (Filter)
vt_selector = VarianceThreshold(threshold=0.1)  # You can adjust the threshold
X_vt = vt_selector.fit_transform(X)
vt_top_features = np.where(vt_selector.get_support())[0]
print_selected_features("Variance Threshold", vt_top_features)


Variance Threshold Selected Features:
1. Feature Fine name
2. Feature pdfsize
3. Feature metadata size
4. Feature pages
5. Feature xref Length
6. Feature embedded files
7. Feature images
8. Feature header
9. Feature obj


In [12]:
# 5. Recursive Feature Elimination (RFE) - Wrapper
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X_rfe = rfe_selector.fit_transform(X, y)
rfe_top_features = np.where(rfe_selector.support_)[0]
print_selected_features("RFE", rfe_top_features)


RFE Selected Features:
1. Feature Fine name
2. Feature pdfsize
3. Feature pages
4. Feature embedded files
5. Feature images


In [13]:
# 6. Sequential Feature Selector (SFS) - Wrapper (Running too Long)

# Encode the target variable (Y Still contain String)
#label_encoder = LabelEncoder()
#y_encoded = label_encoder.fit_transform(y)

# Sequential Feature Selector
#sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"), n_features_to_select=5)
#X_sfs = sfs_selector.fit_transform(X, y_encoded)
#sfs_top_features = np.where(sfs_selector.get_support())[0]
#print_selected_features("SFS", sfs_top_features)

In [14]:
# 6. Sequential Feature Selector (SFS) - Wrapper

# Function to run SFS
def run_sfs(X, y_encoded):
    global sfs_selector, X_sfs
    sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"),n_features_to_select=5, direction='forward', n_jobs=-1)
    X_sfs = sfs_selector.fit_transform(X, y_encoded)

# Encode the target variable (Y Still contain String)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running SFS
sfs_thread = threading.Thread(target=run_sfs, args=(X, y_encoded))

# Start the thread
sfs_thread.start()

# Wait for the thread to complete or timeout
sfs_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if sfs_thread.is_alive():
    print("SFS did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print selected features
    sfs_top_features = np.where(sfs_selector.get_support())[0]
    print_selected_features("SFS", sfs_top_features)

SFS did not complete within the time limit of 10 minutes. Terminating...


In [15]:
# 7. Genetic Algorithm (DEAP) - Wrapper (Running to Long)
#def evaluate(individual):
#    """ Evaluation function for genetic algorithm """
#    selected_features = [i for i, value in enumerate(individual) if value > 0]
#    if len(selected_features) == 0:
#        return 1000,  # Penalty for selecting no features
#    estimator = RandomForestClassifier()
#    X_selected = X.iloc[:, selected_features]
#    estimator.fit(X_selected, y)
#    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
#creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
#toolbox = base.Toolbox()
#toolbox.register("attr_bool", np.random.randint, 0, 2)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("evaluate", evaluate)
#toolbox.register("mate", tools.cxTwoPoint)
#toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
#toolbox.register("select", tools.selTournament, tournsize=3)

#population = toolbox.population(n=50)
#algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)
#best_individual = tools.selBest(population, k=1)[0]
#genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
#print_selected_features("Genetic Algorithm", genetic_selected_features)


In [16]:
# 7. Genetic Algorithm (DEAP) - Wrapper
def evaluate(individual):
    """Evaluation function for genetic algorithm"""
    selected_features = [i for i, value in enumerate(individual) if value > 0]
    if len(selected_features) == 0:
        return 1000,  # Penalty for selecting no features
    estimator = RandomForestClassifier()
    X_selected = X.iloc[:, selected_features]
    estimator.fit(X_selected, y)
    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Function to run Genetic Algorithm (GA)
def run_ga():
    global population
    population = toolbox.population(n=50)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running the GA
ga_thread = threading.Thread(target=run_ga)

# Start the thread
ga_thread.start()

# Wait for the thread to complete or timeout
ga_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if ga_thread.is_alive():
    print("Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print the selected features
    best_individual = tools.selBest(population, k=1)[0]
    genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
    print_selected_features("Genetic Algorithm", genetic_selected_features)

Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...


In [17]:
# 8. Boruta Algorithm (Wrapper)
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced')
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta_selector.fit(X, y)
boruta_selected_features = np.where(boruta_selector.support_)[0]
print_selected_features("Boruta", boruta_selected_features)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	9 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	10 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	13 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	14 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	15 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration: 	16 / 100
Confirmed: 	9
Tentative: 	2
Rejected: 	1
Iteration:

In [18]:
# could not convert string to float for Lasso, Ridge, and Elastic Net so need to change the y variable
# Assuming y is your target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

In [19]:
# 9. Lasso (L1 Regularization) - Embedded
lasso = Lasso(alpha=0.01)
lasso.fit(X, y_encoded)
lasso_support = np.where(lasso.coef_ != 0)[0]
print_selected_features("Lasso", lasso_support)


Lasso Selected Features:
1. Feature Fine name
2. Feature pdfsize
3. Feature metadata size
4. Feature pages
5. Feature xref Length
6. Feature title characters
7. Feature embedded files
8. Feature images
9. Feature header
10. Feature obj


In [20]:
# 10. Ridge (L2 Regularization) - Embedded
ridge = Ridge(alpha=1.0)
ridge.fit(X, y_encoded)
ridge_support = np.where(ridge.coef_ != 0)[0]
print_selected_features("Ridge", ridge_support)


Ridge Selected Features:
1. Feature Fine name
2. Feature pdfsize
3. Feature metadata size
4. Feature pages
5. Feature xref Length
6. Feature title characters
7. Feature isEncrypted
8. Feature embedded files
9. Feature images
10. Feature text
11. Feature header
12. Feature obj


In [21]:
# 11. Elastic Net - Embedded
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X, y_encoded)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]
print_selected_features("Elastic Net", elastic_net_support)


Elastic Net Selected Features:
1. Feature Fine name
2. Feature pdfsize
3. Feature metadata size
4. Feature pages
5. Feature xref Length
6. Feature embedded files


In [22]:
# 12. Random Forest Feature Importance - Embedded
rf_model = RandomForestClassifier().fit(X, y)
rf_importances = rf_model.feature_importances_
rf_top_features = np.argsort(rf_importances)[::-1][:5]
print_selected_features("Random Forest", rf_top_features, rf_importances)


Random Forest Selected Features:
1. Feature pdfsize (Importance: 0.250249)
2. Feature embedded files (Importance: 0.221474)
3. Feature Fine name (Importance: 0.182588)
4. Feature pages (Importance: 0.112480)
5. Feature images (Importance: 0.083472)


In [23]:
# 13. ExtraTreesClassifier - Embedded (Already used in your code)
extratrees = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]
index = np.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
print_selected_features("ExtraTreesClassifier", index, extratrees.feature_importances_)


ExtraTreesClassifier Selected Features:
1. Feature embedded files (Importance: 0.225095)
2. Feature images (Importance: 0.194418)
3. Feature pdfsize (Importance: 0.174011)
4. Feature Fine name (Importance: 0.109045)
5. Feature pages (Importance: 0.090397)


In [24]:
# Compare all selected features
print("\nSummary of Selected Features for Each Method:")
for method, selected in features.items():
    print(f"{method}: {selected}")


Summary of Selected Features for Each Method:
Chi-Square: ['embedded files', 'pages', 'header', 'images', 'metadata size']
ANOVA: ['embedded files', 'images', 'header', 'text', 'title characters']
Mutual Information: ['pdfsize', 'pages', 'Fine name', 'embedded files', 'images']
Variance Threshold: ['Fine name', 'pdfsize', 'metadata size', 'pages', 'xref Length', 'embedded files', 'images', 'header', 'obj']
RFE: ['Fine name', 'pdfsize', 'pages', 'embedded files', 'images']
Boruta: ['Fine name', 'pdfsize', 'metadata size', 'pages', 'xref Length', 'embedded files', 'images', 'header', 'obj']
Lasso: ['Fine name', 'pdfsize', 'metadata size', 'pages', 'xref Length', 'title characters', 'embedded files', 'images', 'header', 'obj']
Ridge: ['Fine name', 'pdfsize', 'metadata size', 'pages', 'xref Length', 'title characters', 'isEncrypted', 'embedded files', 'images', 'text', 'header', 'obj']
Elastic Net: ['Fine name', 'pdfsize', 'metadata size', 'pages', 'xref Length', 'embedded files']
Random 