In [1]:
# Library untuk pengolahan data dan visualisasi
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib
import threading

# Library untuk evaluasi dan model machine learning
from sklearn.metrics import confusion_matrix, classification_report
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Library untuk Explainable AI (XAI)
from lime.lime_tabular import LimeTabularExplainer
import shap

# Library untuk Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [2]:
# Input Dataset 

DM = pd.read_csv("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CIC-PDFMal2022\\PDFMalware2022.csv") #DM--> Dataset Malware

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10026 entries, 0 to 10025
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fine name         10026 non-null  object 
 1   pdfsize           10025 non-null  float64
 2   metadata size     10025 non-null  float64
 3   pages             10025 non-null  float64
 4   xref Length       10025 non-null  float64
 5   title characters  10025 non-null  float64
 6   isEncrypted       10025 non-null  float64
 7   embedded files    10025 non-null  float64
 8   images            10025 non-null  object 
 9   text              10025 non-null  object 
 10  header            10025 non-null  object 
 11  obj               10023 non-null  object 
 12  endobj            10023 non-null  object 
 13  stream            10023 non-null  float64
 14  endstream         10023 non-null  object 
 15  xref              10023 non-null  object 
 16  trailer           10023 non-null  float6

In [4]:
#Feature Selection
features_to_drop = ['Fine name', 'images', 'text', 'header', 'obj', 'endobj', 'endstream', 
                    'xref', 'startxref', 'pageno', 'JS', 'Javascript', 'AA', 'OpenAction', 
                    'Acroform', 'JBIG2Decode', 'RichMedia', 'launch', 'EmbeddedFile', 'XFA', 'Class']

# Droping specified columns and target variable
X = DM.drop(features_to_drop, axis=1).values    
y = DM['Class'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Applying Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Encode target variable 'Class' (y)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [8]:
# Ridge (L2 Regularization) for feature selection
ridge = Ridge(alpha=1.0)  # You can adjust alpha for regularization strength
ridge.fit(X_scaled, y_encoded)

In [9]:
# Selecting non-zero coefficients
ridge_support = np.where(ridge.coef_ != 0)[0]

# Retrieve column names for the selected features
filtered_columns = DM.drop(features_to_drop, axis=1).columns
features = filtered_columns[ridge_support]

# Print the selected features with their coefficients
print("Selected features using Ridge Regularization:")
for idx in ridge_support:
    print(f"Feature {filtered_columns[idx]} with Ridge coefficient {ridge.coef_[idx]}")

Selected features using Ridge Regularization:
Feature pdfsize with Ridge coefficient -0.44580149302838834
Feature metadata size with Ridge coefficient -0.019655417944453923
Feature pages with Ridge coefficient -1.5096667813331572
Feature xref Length with Ridge coefficient 0.7496598910060105
Feature title characters with Ridge coefficient 0.5414249028900608
Feature isEncrypted with Ridge coefficient -2.0232214680081846
Feature embedded files with Ridge coefficient 0.9161762638093326
Feature stream with Ridge coefficient -3.7746196697138132
Feature trailer with Ridge coefficient -2.496303074308347
Feature encrypt with Ridge coefficient -0.3937919691529409
Feature ObjStm with Ridge coefficient -1.5048677557270038
Feature Colors with Ridge coefficient 0.4432615779225491


In [10]:
# Memilih 100% data secara acak dari setiap fitur/column
sampled_data = DM.groupby('Class').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)
hasil_threshold = []

  sampled_data = DM.groupby('Class').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)


In [11]:
# Pisahkan data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [12]:
# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [13]:
# Inisialisasi variabel untuk menyimpan hasil evaluasi
hasil_ml_dl = []
hasil_ml_dl_xai = []

# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fungsi untuk mengevaluasi model ML/DL
def EvaluateModel(model_name, model, X_train, y_train, X_test, y_test, use_xai=False, is_dl_model=False):
    start_time = time.time()

    # Melatih model
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0) if is_dl_model else model.fit(X_train, y_train)
    
    if is_dl_model:
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    elif hasattr(model, 'predict_proba'):
        # Model dengan metode predict_proba
        y_pred_proba = model.predict_proba(X_test)
        if y_pred_proba.shape[1] > 1:  # Model klasifikasi multi-kelas
            y_pred = np.argmax(y_pred_proba, axis=1)
        else:  # Model klasifikasi biner
            y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    else:
        # Model tanpa metode predict_proba
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    # Menghitung confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Menghitung metrik
    Precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    Recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    F1Score = 2 * ((Precision * Recall) / (Precision + Recall)) if (Precision + Recall) != 0 else 0
    Accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0

    # Menghitung waktu running
    run_time = time.time() - start_time

    # Jika XAI diperlukan, tambahkan analisis dengan SHAP
    if use_xai:
        # Periksa apakah X_train adalah DataFrame
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns
        else:
            feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]
        
        # Perbaiki format X_train untuk SHAP
        if is_dl_model:
            X_train_for_xai = X_train.reshape((X_train.shape[0], X_train.shape[1]))
            X_test_for_xai = X_test.reshape((X_test.shape[0], X_test.shape[1]))
        else:
            X_train_for_xai = X_train
            X_test_for_xai = X_test

        # Gunakan SHAP
        try:
            if is_dl_model:
                explainer = shap.DeepExplainer(model, X_train_for_xai)
                shap_values = explainer.shap_values(X_test_for_xai)
                shap_summary = np.mean(shap_values[0], axis=0)
            else:
                explainer = shap.Explainer(model.predict_proba, X_train_for_xai)
                shap_values = explainer(X_test_for_xai)
                shap_summary = shap_values.values.mean(axis=0)
        except Exception as e:
            print(f"Error using SHAP with {model_name}: {e}")
            shap_summary = None

        # Simpan hasil evaluasi dengan XAI
        hasil_ml_dl_xai.append([model_name, Precision, Recall, F1Score, Accuracy, run_time, {'SHAP': shap_summary}])
    else:
        # Simpan hasil evaluasi tanpa XAI
        hasil_ml_dl.append([model_name, Precision, Recall, F1Score, Accuracy, run_time])

# Model ML dan DL yang akan dievaluasi
model_ml_dl = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000),
    "Stochastic Gradient Descent": SGDClassifier(loss='log_loss', random_state=42),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(n_estimators=100),
    "LightGBM": LGBMClassifier(n_estimators=100),
    "CatBoost": CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss')
}

model_dl = {
    "DNN": Sequential([
        Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "CNN": Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "RNN": Sequential([
        LSTM(100, input_shape=(X_train.shape[1], 1)),
        Dense(1, activation='sigmoid')
    ]),
}

# Pastikan X_train dan X_test memiliki bentuk yang sesuai untuk DL
X_train_dl = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_dl = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Mengevaluasi model ML tanpa XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=False)

# Mengevaluasi model DL tanpa XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=False, is_dl_model=True)

# Mengevaluasi model ML dengan XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=True)

# Mengevaluasi model DL dengan XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=True, is_dl_model=True)

# Print hasil evaluasi tanpa XAI
print("\nHasil Evaluasi ML/DL tanpa XAI:")
print(hasil_ml_dl)

# Print hasil evaluasi dengan XAI
print("\nHasil Evaluasi ML/DL dengan XAI:")
print(hasil_ml_dl_xai)

# Convert results to DataFrame and save to CSV
df_ml_dl = pd.DataFrame(hasil_ml_dl, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime"])
df_ml_dl_xai = pd.DataFrame(hasil_ml_dl_xai, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime", "XAI"])

df_ml_dl.to_csv("hasil_evaluasi_ml_dl_RFC.csv", index=False)
df_ml_dl_xai.to_csv("hasil_evaluasi_ml_dl_xai_RFC.csv", index=False)

[LightGBM] [Info] Number of positive: 3865, number of negative: 3151
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1186
[LightGBM] [Info] Number of data points in the train set: 7016, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550884 -> initscore=0.204242
[LightGBM] [Info] Start training from score 0.204242
0:	learn: 0.6233952	total: 137ms	remaining: 13.6s
1:	learn: 0.5667828	total: 139ms	remaining: 6.83s
2:	learn: 0.5158579	total: 141ms	remaining: 4.57s
3:	learn: 0.4701687	total: 143ms	remaining: 3.44s
4:	learn: 0.4331439	total: 145ms	remaining: 2.76s
5:	learn: 0.4001143	total: 147ms	remaining: 2.31s
6:	learn: 0.3720034	total: 149ms	remaining: 1.98s
7:	learn: 0.3466359	total: 151ms	remaining: 1.74s
8:	learn: 0.3234386	total: 154ms	remaining: 1.5

PermutationExplainer explainer: 3008it [00:40, 66.50it/s]                                                              
PermutationExplainer explainer: 3008it [19:14,  2.56it/s]                                                              
PermutationExplainer explainer: 3008it [03:08, 15.46it/s]                                                              
PermutationExplainer explainer: 3008it [03:22, 14.18it/s]                                                              
PermutationExplainer explainer: 3008it [02:31, 18.66it/s]                                                              
PermutationExplainer explainer: 3008it [03:12, 14.78it/s]                                                              
PermutationExplainer explainer: 3008it [1:34:43,  1.89s/it]                                                            
PermutationExplainer explainer: 3008it [04:01, 11.42it/s]                                                              
PermutationExplainer explainer: 3008it [

[LightGBM] [Info] Number of positive: 3865, number of negative: 3151
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1186
[LightGBM] [Info] Number of data points in the train set: 7016, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550884 -> initscore=0.204242
[LightGBM] [Info] Start training from score 0.204242


PermutationExplainer explainer: 3008it [02:17, 20.22it/s]                                                              


0:	learn: 0.6233952	total: 1.73ms	remaining: 172ms
1:	learn: 0.5667828	total: 3.21ms	remaining: 157ms
2:	learn: 0.5158579	total: 4.72ms	remaining: 153ms
3:	learn: 0.4701687	total: 6.27ms	remaining: 150ms
4:	learn: 0.4331439	total: 7.94ms	remaining: 151ms
5:	learn: 0.4001143	total: 9.45ms	remaining: 148ms
6:	learn: 0.3720034	total: 11ms	remaining: 146ms
7:	learn: 0.3466359	total: 12.5ms	remaining: 144ms
8:	learn: 0.3234386	total: 14.1ms	remaining: 143ms
9:	learn: 0.3037145	total: 15.7ms	remaining: 141ms
10:	learn: 0.2849609	total: 17.5ms	remaining: 141ms
11:	learn: 0.2686569	total: 19.1ms	remaining: 140ms
12:	learn: 0.2537131	total: 20.7ms	remaining: 138ms
13:	learn: 0.2402928	total: 22.2ms	remaining: 137ms
14:	learn: 0.2283432	total: 23.8ms	remaining: 135ms
15:	learn: 0.2181887	total: 25.2ms	remaining: 132ms
16:	learn: 0.2071913	total: 26.8ms	remaining: 131ms
17:	learn: 0.1986340	total: 28.5ms	remaining: 130ms
18:	learn: 0.1898069	total: 30ms	remaining: 128ms
19:	learn: 0.1823055	total

PermutationExplainer explainer: 3008it [05:07,  9.47it/s]                                                              










Error using SHAP with CNN: operands could not be broadcast together with shapes (7016,12,1) (7016,12) 




Error using SHAP with RNN: in user code:

    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 244, in grad_graph  *
        out = self.model(shap_rAnD)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 371, in custom_grad
        out = op_handlers[type_name](self, op, *grads) # we cut off the shap_ prefix before the lookup
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 663, in handler
        return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\P

