In [1]:
# Library
import numpy as np
import pandas as pd
import threading
import os
from sklearn.feature_selection import (chi2, f_classif, mutual_info_classif, VarianceThreshold, RFE, SequentialFeatureSelector, SelectFromModel)
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from deap import base, creator, tools, algorithms  # For genetic algorithm in feature selection
from boruta import BorutaPy  # For Boruta algorithm
from sklearn.svm import SVR  # Example estimator for RFE and SFS
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
# Input Dataset 

# Define the directory where the parquet files are stored
parquet_directory = "C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CTU-13"

# List all parquet files in the directory
parquet_files = [f for f in os.listdir(parquet_directory) if f.endswith('.parquet')]

# Read each parquet file and append it to a list of DataFrames
dataframes = [pd.read_parquet(os.path.join(parquet_directory, file)) for file in parquet_files]

# Concatenate all DataFrames into a single DataFrame
DM = pd.concat(dataframes, ignore_index=True)

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10598771 entries, 0 to 10598770
Data columns (total 11 columns):
 #   Column     Dtype  
---  ------     -----  
 0   dur        float32
 1   proto      object 
 2   dir        object 
 3   state      object 
 4   stos       float32
 5   dtos       float32
 6   tot_pkts   int32  
 7   tot_bytes  int64  
 8   src_bytes  int64  
 9   label      object 
 10  Family     object 
dtypes: float32(3), int32(1), int64(2), object(5)
memory usage: 727.8+ MB


In [4]:
#Feature Selection
X = DM.drop(['proto', 'dir', 'state', 'label','Family'],axis=1).values    #Droping this because classification model will not accept object type elements (float and int only)
# Target variable
y = DM['label'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Store selected features
features = {}

In [7]:
# Function to print and append selected features
def print_selected_features(method_name, selected_indices, importance=None):
    selected_features = []
    print(f"\n{method_name} Selected Features:")
    for i, idx in enumerate(selected_indices):
        feature_name = DM.columns[idx]
        if importance is not None:
            print(f"{i + 1}. Feature {feature_name} (Importance: {importance[idx]:.6f})")
        else:
            print(f"{i + 1}. Feature {feature_name}")
        selected_features.append(feature_name)
    features[method_name] = selected_features

In [8]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_chi2 = scaler.fit_transform(X)

# 1. Chi-Square Test (Filter)
chi2_selector = chi2(X_chi2, y)
chi2_scores = chi2_selector[0]
chi2_top_features = np.argsort(chi2_scores)[::-1][:5]
print_selected_features("Chi-Square", chi2_top_features, chi2_scores)


Chi-Square Selected Features:
1. Feature dur (Importance: 1391861.960142)
2. Feature proto (Importance: 10085.538096)
3. Feature dir (Importance: 1671.412729)
4. Feature stos (Importance: 476.124774)
5. Feature dtos (Importance: 251.282248)


In [9]:
# 2. ANOVA F-test (Filter)
anova_selector = f_classif(X, y)
anova_scores = anova_selector[0]
anova_top_features = np.argsort(anova_scores)[::-1][:5]
print_selected_features("ANOVA", anova_top_features, anova_scores)


ANOVA Selected Features:
1. Feature dur (Importance: 2395.922086)
2. Feature proto (Importance: 9.228532)
3. Feature stos (Importance: 3.175797)
4. Feature state (Importance: 2.142683)
5. Feature dir (Importance: 1.478296)


In [10]:
# 3. Mutual Information (Filter)
mutual_info_scores = mutual_info_classif(X, y)
mutual_info_top_features = np.argsort(mutual_info_scores)[::-1][:5]
print_selected_features("Mutual Information", mutual_info_top_features, mutual_info_scores)


Mutual Information Selected Features:
1. Feature stos (Importance: 0.782502)
2. Feature dtos (Importance: 0.765841)
3. Feature dur (Importance: 0.667200)
4. Feature state (Importance: 0.549713)
5. Feature proto (Importance: 0.000696)


In [11]:
# 4. Variance Threshold (Filter)
vt_selector = VarianceThreshold(threshold=0.1)  # You can adjust the threshold
X_vt = vt_selector.fit_transform(X)
vt_top_features = np.where(vt_selector.get_support())[0]
print_selected_features("Variance Threshold", vt_top_features)


Variance Threshold Selected Features:
1. Feature dur
2. Feature state
3. Feature stos
4. Feature dtos


In [12]:
# 5. Recursive Feature Elimination (RFE) - Wrapper
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X_rfe = rfe_selector.fit_transform(X, y)
rfe_top_features = np.where(rfe_selector.support_)[0]
print_selected_features("RFE", rfe_top_features)

MemoryError: could not allocate 22028484608 bytes

In [15]:
# 6. Sequential Feature Selector (SFS) - Wrapper (Running too Long)

# Encode the target variable (Y Still contain String)
#label_encoder = LabelEncoder()
#y_encoded = label_encoder.fit_transform(y)

# Sequential Feature Selector
#sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"), n_features_to_select=5)
#X_sfs = sfs_selector.fit_transform(X, y_encoded)
#sfs_top_features = np.where(sfs_selector.get_support())[0]
#print_selected_features("SFS", sfs_top_features)

In [16]:
# 6. Sequential Feature Selector (SFS) - Wrapper

# Function to run SFS
def run_sfs(X, y_encoded):
    global sfs_selector, X_sfs
    sfs_selector = SequentialFeatureSelector(estimator=SVR(kernel="linear"),n_features_to_select=5, direction='forward', n_jobs=-1)
    X_sfs = sfs_selector.fit_transform(X, y_encoded)

# Encode the target variable (Y Still contain String)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running SFS
sfs_thread = threading.Thread(target=run_sfs, args=(X, y_encoded))

# Start the thread
sfs_thread.start()

# Wait for the thread to complete or timeout
sfs_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if sfs_thread.is_alive():
    print("SFS did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print selected features
    sfs_top_features = np.where(sfs_selector.get_support())[0]
    print_selected_features("SFS", sfs_top_features)

SFS did not complete within the time limit of 10 minutes. Terminating...


In [17]:
# 7. Genetic Algorithm (DEAP) - Wrapper (Running to Long)
#def evaluate(individual):
#    """ Evaluation function for genetic algorithm """
#    selected_features = [i for i, value in enumerate(individual) if value > 0]
#    if len(selected_features) == 0:
#        return 1000,  # Penalty for selecting no features
#    estimator = RandomForestClassifier()
#    X_selected = X.iloc[:, selected_features]
#    estimator.fit(X_selected, y)
#    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
#creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
#toolbox = base.Toolbox()
#toolbox.register("attr_bool", np.random.randint, 0, 2)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("evaluate", evaluate)
#toolbox.register("mate", tools.cxTwoPoint)
#toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
#toolbox.register("select", tools.selTournament, tournsize=3)

#population = toolbox.population(n=50)
#algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)
#best_individual = tools.selBest(population, k=1)[0]
#genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
#print_selected_features("Genetic Algorithm", genetic_selected_features)


In [18]:
# 7. Genetic Algorithm (DEAP) - Wrapper
def evaluate(individual):
    """Evaluation function for genetic algorithm"""
    selected_features = [i for i, value in enumerate(individual) if value > 0]
    if len(selected_features) == 0:
        return 1000,  # Penalty for selecting no features
    estimator = RandomForestClassifier()
    X_selected = X.iloc[:, selected_features]
    estimator.fit(X_selected, y)
    return 1 - estimator.score(X_selected, y),

# Genetic Algorithm setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Function to run Genetic Algorithm (GA)
def run_ga():
    global population
    population = toolbox.population(n=50)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=False)

# Set a timeout duration (in seconds)
timeout_duration = 600  # 600 seconds = 10 minutes

# Create a thread for running the GA
ga_thread = threading.Thread(target=run_ga)

# Start the thread
ga_thread.start()

# Wait for the thread to complete or timeout
ga_thread.join(timeout=timeout_duration)

# Check if the thread is still active
if ga_thread.is_alive():
    print("Genetic Algorithm did not complete within the time limit of 10 minutes. Terminating...")
    # Optionally, raise an error or take other actions
else:
    # If the thread finished in time, print the selected features
    best_individual = tools.selBest(population, k=1)[0]
    genetic_selected_features = [i for i, value in enumerate(best_individual) if value > 0]
    print_selected_features("Genetic Algorithm", genetic_selected_features)

Exception in thread Thread-9 (run_ga):
Traceback (most recent call last):
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Muhammad Raihan\AppData\Local\Temp\ipykernel_29932\3259204369.py", line 28, in run_ga
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\deap\algorithms.py", line 151, in eaSimple
    for ind, fit in zip(invalid_ind, fitnesses):
  File "C:\Users\Muhammad Raihan\AppData\Local\Temp\ipykernel_29932\3259204369.py", line 9, in evaluate
  File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\U


Genetic Algorithm Selected Features:
1. Feature proto
2. Feature state


In [19]:
# 8. Boruta Algorithm (Wrapper)
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced')
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta_selector.fit(X, y)
boruta_selected_features = np.where(boruta_selector.support_)[0]
print_selected_features("Boruta", boruta_selected_features)

ValueError: Please check your X and y variable. The provided estimator cannot be fitted to your data.
Unable to allocate 73.4 MiB for an array with shape (9614377,) and data type int64

In [20]:
# could not convert string to float for Lasso, Ridge, and Elastic Net so need to change the y variable
# Assuming y is your target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) 

In [21]:
# 9. Lasso (L1 Regularization) - Embedded
lasso = Lasso(alpha=0.01)
lasso.fit(X, y_encoded)
lasso_support = np.where(lasso.coef_ != 0)[0]
print_selected_features("Lasso", lasso_support)


Lasso Selected Features:
1. Feature dur
2. Feature proto
3. Feature dir
4. Feature state
5. Feature stos
6. Feature dtos


In [22]:
# 10. Ridge (L2 Regularization) - Embedded
ridge = Ridge(alpha=1.0)
ridge.fit(X, y_encoded)
ridge_support = np.where(ridge.coef_ != 0)[0]
print_selected_features("Ridge", ridge_support)


Ridge Selected Features:
1. Feature dur
2. Feature proto
3. Feature dir
4. Feature state
5. Feature stos
6. Feature dtos


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [23]:
# 11. Elastic Net - Embedded
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X, y_encoded)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]
print_selected_features("Elastic Net", elastic_net_support)


Elastic Net Selected Features:
1. Feature dur
2. Feature proto
3. Feature state
4. Feature stos
5. Feature dtos


In [24]:
# 12. Random Forest Feature Importance - Embedded
rf_model = RandomForestClassifier().fit(X, y)
rf_importances = rf_model.feature_importances_
rf_top_features = np.argsort(rf_importances)[::-1][:5]
print_selected_features("Random Forest", rf_top_features, rf_importances)

MemoryError: could not allocate 11014242304 bytes

In [25]:
# 13. ExtraTreesClassifier - Embedded (Already used in your code)
extratrees = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]
index = np.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
print_selected_features("ExtraTreesClassifier", index, extratrees.feature_importances_)

MemoryError: could not allocate 5507121152 bytes

In [26]:
# Compare all selected features
print("\nSummary of Selected Features for Each Method:")
for method, selected in features.items():
    print(f"{method}: {selected}")


Summary of Selected Features for Each Method:
Chi-Square: ['dur', 'proto', 'dir', 'stos', 'dtos']
ANOVA: ['dur', 'proto', 'stos', 'state', 'dir']
Mutual Information: ['stos', 'dtos', 'dur', 'state', 'proto']
Variance Threshold: ['dur', 'state', 'stos', 'dtos']
Genetic Algorithm: ['proto', 'state']
Lasso: ['dur', 'proto', 'dir', 'state', 'stos', 'dtos']
Ridge: ['dur', 'proto', 'dir', 'state', 'stos', 'dtos']
Elastic Net: ['dur', 'proto', 'state', 'stos', 'dtos']
