In [1]:
# Library untuk pengolahan data dan visualisasi
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib
import threading

# Library untuk evaluasi dan model machine learning
from sklearn.metrics import confusion_matrix, classification_report
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Library untuk Explainable AI (XAI)
from lime.lime_tabular import LimeTabularExplainer
import shap

# Library untuk Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [2]:
# Input Dataset 

DM = pd.read_csv("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\Obfuscated-MalMem2022\\Obfuscated-MalMem2022.csv") #DM--> Dataset Malware

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58596 entries, 0 to 58595
Data columns (total 57 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Category                                58596 non-null  object 
 1   pslist.nproc                            58596 non-null  int64  
 2   pslist.nppid                            58596 non-null  int64  
 3   pslist.avg_threads                      58596 non-null  float64
 4   pslist.nprocs64bit                      58596 non-null  int64  
 5   pslist.avg_handlers                     58596 non-null  float64
 6   dlllist.ndlls                           58596 non-null  int64  
 7   dlllist.avg_dlls_per_proc               58596 non-null  float64
 8   handles.nhandles                        58596 non-null  int64  
 9   handles.avg_handles_per_proc            58596 non-null  float64
 10  handles.nport                           58596 non-null  in

In [4]:
#Feature Selection
features_to_drop = ['Category','Class']

# Droping specified columns and target variable
X = DM.drop(features_to_drop, axis=1).values    
y = DM['Class'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Encoding y dengan LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [8]:
# Elastic Net - Embedded
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.05)
elastic_net.fit(X_scaled, y_encoded)

In [9]:
# Mendapatkan support dari Elastic Net (fitur yang koefisiennya tidak 0)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]

# Mengambil nama kolom dari X yang sudah difilter
filtered_columns = DM.drop(features_to_drop, axis=1).columns

features = []
for idx in elastic_net_support:
    print(f"Feature {filtered_columns[idx]} dengan koefisien Elastic Net {elastic_net.coef_[idx]}")
    features.append(filtered_columns[idx])

Feature pslist.nppid dengan koefisien Elastic Net 0.17498088672755707
Feature pslist.avg_threads dengan koefisien Elastic Net -0.6537626778387375
Feature dlllist.ndlls dengan koefisien Elastic Net -0.3439927150200938
Feature dlllist.avg_dlls_per_proc dengan koefisien Elastic Net -0.7731545306615779
Feature handles.nevent dengan koefisien Elastic Net -0.48237443495727395
Feature handles.nkey dengan koefisien Elastic Net -0.1394865627938468
Feature handles.nthread dengan koefisien Elastic Net -0.14978458579346252
Feature handles.ndirectory dengan koefisien Elastic Net 0.014651386189496733
Feature handles.nsemaphore dengan koefisien Elastic Net -0.007628513164729904
Feature handles.ntimer dengan koefisien Elastic Net -0.10513633168966578
Feature handles.nsection dengan koefisien Elastic Net -0.004146597769118551
Feature handles.nmutant dengan koefisien Elastic Net -0.5636310023356788
Feature ldrmodules.not_in_load dengan koefisien Elastic Net -0.28261254182171486
Feature ldrmodules.not_in

In [10]:
# Memilih 100% data secara acak dari setiap fitur/column
sampled_data = DM.groupby('Class').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)

  sampled_data = DM.groupby('Class').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)


In [11]:
# Pisahkan data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [12]:
# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [13]:
# Inisialisasi variabel untuk menyimpan hasil evaluasi
hasil_ml_dl = []
hasil_ml_dl_xai = []

# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fungsi untuk mengevaluasi model ML/DL
def EvaluateModel(model_name, model, X_train, y_train, X_test, y_test, use_xai=False, is_dl_model=False):
    start_time = time.time()

    # Melatih model
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0) if is_dl_model else model.fit(X_train, y_train)
    
    if is_dl_model:
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    elif hasattr(model, 'predict_proba'):
        # Model dengan metode predict_proba
        y_pred_proba = model.predict_proba(X_test)
        if y_pred_proba.shape[1] > 1:  # Model klasifikasi multi-kelas
            y_pred = np.argmax(y_pred_proba, axis=1)
        else:  # Model klasifikasi biner
            y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    else:
        # Model tanpa metode predict_proba
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    # Menghitung confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Menghitung metrik
    Precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    Recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    F1Score = 2 * ((Precision * Recall) / (Precision + Recall)) if (Precision + Recall) != 0 else 0
    Accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0

    # Menghitung waktu running
    run_time = time.time() - start_time

    # Jika XAI diperlukan, tambahkan analisis dengan SHAP
    if use_xai:
        # Periksa apakah X_train adalah DataFrame
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns
        else:
            feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]
        
        # Perbaiki format X_train untuk SHAP
        if is_dl_model:
            X_train_for_xai = X_train.reshape((X_train.shape[0], X_train.shape[1]))
            X_test_for_xai = X_test.reshape((X_test.shape[0], X_test.shape[1]))
        else:
            X_train_for_xai = X_train
            X_test_for_xai = X_test

        # Gunakan SHAP
        try:
            if is_dl_model:
                explainer = shap.DeepExplainer(model, X_train_for_xai)
                shap_values = explainer.shap_values(X_test_for_xai)
                shap_summary = np.mean(shap_values[0], axis=0)
            else:
                explainer = shap.Explainer(model.predict_proba, X_train_for_xai)
                shap_values = explainer(X_test_for_xai)
                shap_summary = shap_values.values.mean(axis=0)
        except Exception as e:
            print(f"Error using SHAP with {model_name}: {e}")
            shap_summary = None

        # Simpan hasil evaluasi dengan XAI
        hasil_ml_dl_xai.append([model_name, Precision, Recall, F1Score, Accuracy, run_time, {'SHAP': shap_summary}])
    else:
        # Simpan hasil evaluasi tanpa XAI
        hasil_ml_dl.append([model_name, Precision, Recall, F1Score, Accuracy, run_time])

# Model ML dan DL yang akan dievaluasi
model_ml_dl = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000),
    "Stochastic Gradient Descent": SGDClassifier(loss='log_loss', random_state=42),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(n_estimators=100),
    "LightGBM": LGBMClassifier(n_estimators=100),
    "CatBoost": CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss')
}

model_dl = {
    "DNN": Sequential([
        Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "CNN": Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "RNN": Sequential([
        LSTM(100, input_shape=(X_train.shape[1], 1)),
        Dense(1, activation='sigmoid')
    ]),
}

# Pastikan X_train dan X_test memiliki bentuk yang sesuai untuk DL
X_train_dl = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_dl = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Mengevaluasi model ML tanpa XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=False)

# Mengevaluasi model DL tanpa XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=False, is_dl_model=True)

# Mengevaluasi model ML dengan XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=True)

# Mengevaluasi model DL dengan XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=True, is_dl_model=True)

# Print hasil evaluasi tanpa XAI
print("\nHasil Evaluasi ML/DL tanpa XAI:")
print(hasil_ml_dl)

# Print hasil evaluasi dengan XAI
print("\nHasil Evaluasi ML/DL dengan XAI:")
print(hasil_ml_dl_xai)

# Convert results to DataFrame and save to CSV
df_ml_dl = pd.DataFrame(hasil_ml_dl, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime"])
df_ml_dl_xai = pd.DataFrame(hasil_ml_dl_xai, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime", "XAI"])

df_ml_dl.to_csv("hasil_evaluasi_ml_dl_ENFC.csv", index=False)
df_ml_dl_xai.to_csv("hasil_evaluasi_ml_dl_xai_ENFC.csv", index=False)

[LightGBM] [Info] Number of positive: 20469, number of negative: 20548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006870 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7597
[LightGBM] [Info] Number of data points in the train set: 41017, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499037 -> initscore=-0.003852
[LightGBM] [Info] Start training from score -0.003852
0:	learn: 0.4139860	total: 150ms	remaining: 14.9s
1:	learn: 0.2530356	total: 167ms	remaining: 8.18s
2:	learn: 0.1431572	total: 183ms	remaining: 5.92s
3:	learn: 0.0828220	total: 203ms	remaining: 4.86s
4:	learn: 0.0490886	total: 228ms	remaining: 4.33s
5:	learn: 0.0278122	total: 245ms	remaining: 3.84s
6:	learn: 0.0173225	total: 261ms	remaining: 3.46s
7:	learn: 0.0121741	total: 276ms	remaining: 3.17s
8:	learn: 0.0084377	total: 291ms	remaining: 2.94s
9:	learn: 0.0060383	total: 309ms	remaining: 2.78s
10:	le

PermutationExplainer explainer: 17580it [12:35, 23.19it/s]                                                             
PermutationExplainer explainer: 17580it [50:19,  5.80it/s]                                                             
PermutationExplainer explainer: 17580it [11:53, 24.34it/s]                                                             
PermutationExplainer explainer: 17580it [32:21,  9.01it/s]                                                             
PermutationExplainer explainer: 17580it [13:21, 21.66it/s]                                                             
PermutationExplainer explainer: 17580it [11:02, 26.14it/s]                                                             
PermutationExplainer explainer: 17580it [10:35:58,  2.17s/it]                                                          
PermutationExplainer explainer: 17580it [18:14, 15.91it/s]                                                             
PermutationExplainer explainer: 17580it 

[LightGBM] [Info] Number of positive: 20469, number of negative: 20548
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7597
[LightGBM] [Info] Number of data points in the train set: 41017, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499037 -> initscore=-0.003852
[LightGBM] [Info] Start training from score -0.003852


PermutationExplainer explainer: 17580it [20:40, 14.06it/s]                                                             


0:	learn: 0.4139860	total: 12.5ms	remaining: 1.24s
1:	learn: 0.2530356	total: 24.2ms	remaining: 1.19s
2:	learn: 0.1431572	total: 36.6ms	remaining: 1.18s
3:	learn: 0.0828220	total: 48.1ms	remaining: 1.15s
4:	learn: 0.0490886	total: 61.1ms	remaining: 1.16s
5:	learn: 0.0278122	total: 73.7ms	remaining: 1.15s
6:	learn: 0.0173225	total: 85.6ms	remaining: 1.14s
7:	learn: 0.0121741	total: 97.6ms	remaining: 1.12s
8:	learn: 0.0084377	total: 109ms	remaining: 1.1s
9:	learn: 0.0060383	total: 124ms	remaining: 1.11s
10:	learn: 0.0047036	total: 137ms	remaining: 1.11s
11:	learn: 0.0039345	total: 150ms	remaining: 1.1s
12:	learn: 0.0033389	total: 162ms	remaining: 1.08s
13:	learn: 0.0026314	total: 175ms	remaining: 1.07s
14:	learn: 0.0022482	total: 186ms	remaining: 1.05s
15:	learn: 0.0019083	total: 198ms	remaining: 1.04s
16:	learn: 0.0017279	total: 209ms	remaining: 1.02s
17:	learn: 0.0014875	total: 221ms	remaining: 1.01s
18:	learn: 0.0013509	total: 233ms	remaining: 994ms
19:	learn: 0.0011706	total: 245ms	r

PermutationExplainer explainer: 17580it [3:09:48,  1.54it/s]                                                           










Error using SHAP with CNN: operands could not be broadcast together with shapes (41017,55,1) (41017,55) 




Error using SHAP with RNN: in user code:

    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 244, in grad_graph  *
        out = self.model(shap_rAnD)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 371, in custom_grad
        out = op_handlers[type_name](self, op, *grads) # we cut off the shap_ prefix before the lookup
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 663, in handler
        return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\P