In [1]:
# Library
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from lime.lime_tabular import LimeTabularExplainer
import shap
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from keras.optimizers import Adam

In [2]:
# Input Dataset 

DM = pd.read_csv("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CIC-PDFMal2022\\PDFMalware2022.csv") #DM--> Dataset Malware

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10026 entries, 0 to 10025
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fine name         10026 non-null  object 
 1   pdfsize           10025 non-null  float64
 2   metadata size     10025 non-null  float64
 3   pages             10025 non-null  float64
 4   xref Length       10025 non-null  float64
 5   title characters  10025 non-null  float64
 6   isEncrypted       10025 non-null  float64
 7   embedded files    10025 non-null  float64
 8   images            10025 non-null  object 
 9   text              10025 non-null  object 
 10  header            10025 non-null  object 
 11  obj               10023 non-null  object 
 12  endobj            10023 non-null  object 
 13  stream            10023 non-null  float64
 14  endstream         10023 non-null  object 
 15  xref              10023 non-null  object 
 16  trailer           10023 non-null  float6

In [4]:
#Feature Selection
features = DM.drop(['Fine name','images','text','header','obj','endobj','endstream','xref','startxref','pageno','JS','Javascript','AA','OpenAction','Acroform','JBIG2Decode','RichMedia','launch','EmbeddedFile','XFA','Class'],axis=1).columns.tolist()
# Target variable
y = DM['Class'].values

In [5]:
# Memilih 100% data secara acak dari setiap fitur/column
sampled_data = DM.groupby('Class').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)

  sampled_data = DM.groupby('Class').apply(lambda x: x.sample(frac=1)).reset_index(drop=True)


In [6]:
for i, column_name in enumerate(features):
    print(f"{i+1:3} {column_name:40} {sampled_data[column_name].count():<15} {sampled_data[column_name].dtype}")

  1 pdfsize                                  10025           float64
  2 metadata size                            10025           float64
  3 pages                                    10025           float64
  4 xref Length                              10025           float64
  5 title characters                         10025           float64
  6 isEncrypted                              10025           float64
  7 embedded files                           10025           float64
  8 stream                                   10023           float64
  9 trailer                                  10023           float64
 10 encrypt                                  10023           float64
 11 ObjStm                                   10023           float64
 12 Colors                                   10023           float64


In [7]:
# Pisahkan data menjadi fitur (X) dan target (y)
X = sampled_data[features]
y = sampled_data['Class']

# Split data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Membuat imputer untuk menggantikan NaN dengan rata-rata
imputer = SimpleImputer(strategy='mean')

# Menangani missing values pada X_train dan X_test
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [9]:
# Gabungkan semua label
all_labels = np.concatenate([y_train, y_test])

# Inisialisasi LabelEncoder
label_encoder = LabelEncoder()

# Fit encoder pada gabungan label
label_encoder.fit(all_labels)

# Transform data
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Cek label yang ada di y_train dan y_test
train_labels = set(label_encoder.classes_)
test_labels = set(y_test)

# Menampilkan label yang tidak ada di y_train
missing_labels = test_labels - train_labels
if missing_labels:
    print(f"Labels in y_test that are not in y_train: {missing_labels}")
else:
    print("All labels in y_test are present in y_train.")


All labels in y_test are present in y_train.


In [10]:
# Inisialisasi variabel untuk menyimpan hasil evaluasi
hasil_ml_dl = []
hasil_ml_dl_xai = []

# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fungsi untuk mengevaluasi model ML/DL
def EvaluateModel(model_name, model, X_train, y_train, X_test, y_test, use_xai=False, is_dl_model=False):
    start_time = time.time()

    # Melatih model
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0) if is_dl_model else model.fit(X_train, y_train)
    
    if is_dl_model:
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    elif hasattr(model, 'predict_proba'):
        # Model dengan metode predict_proba
        y_pred_proba = model.predict_proba(X_test)
        if y_pred_proba.shape[1] > 1:  # Model klasifikasi multi-kelas
            y_pred = np.argmax(y_pred_proba, axis=1)
        else:  # Model klasifikasi biner
            y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    else:
        # Model tanpa metode predict_proba
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    # Menghitung confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Menghitung metrik
    Precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    Recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    F1Score = 2 * ((Precision * Recall) / (Precision + Recall)) if (Precision + Recall) != 0 else 0
    Accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0

    # Menghitung waktu running
    run_time = time.time() - start_time

    # Jika XAI diperlukan, tambahkan analisis dengan SHAP
    if use_xai:
        # Periksa apakah X_train adalah DataFrame
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns
        else:
            feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]
        
        # Perbaiki format X_train untuk SHAP
        if is_dl_model:
            X_train_for_xai = X_train.reshape((X_train.shape[0], X_train.shape[1]))
            X_test_for_xai = X_test.reshape((X_test.shape[0], X_test.shape[1]))
        else:
            X_train_for_xai = X_train
            X_test_for_xai = X_test

        # Gunakan SHAP
        try:
            if is_dl_model:
                explainer = shap.DeepExplainer(model, X_train_for_xai)
                shap_values = explainer.shap_values(X_test_for_xai)
                shap_summary = np.mean(shap_values[0], axis=0)
            else:
                explainer = shap.Explainer(model.predict_proba, X_train_for_xai)
                shap_values = explainer(X_test_for_xai)
                shap_summary = shap_values.values.mean(axis=0)
        except Exception as e:
            print(f"Error using SHAP with {model_name}: {e}")
            shap_summary = None

        # Simpan hasil evaluasi dengan XAI
        hasil_ml_dl_xai.append([model_name, Precision, Recall, F1Score, Accuracy, run_time, {'SHAP': shap_summary}])
    else:
        # Simpan hasil evaluasi tanpa XAI
        hasil_ml_dl.append([model_name, Precision, Recall, F1Score, Accuracy, run_time])

# Model ML dan DL yang akan dievaluasi
model_ml_dl = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000),
    "Stochastic Gradient Descent": SGDClassifier(loss='log_loss', random_state=42),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(n_estimators=100),
    "LightGBM": LGBMClassifier(n_estimators=100),
    "CatBoost": CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss')
}

model_dl = {
    "DNN": Sequential([
        Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "CNN": Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "RNN": Sequential([
        LSTM(100, input_shape=(X_train.shape[1], 1)),
        Dense(1, activation='sigmoid')
    ]),
}

# Pastikan X_train dan X_test memiliki bentuk yang sesuai untuk DL
X_train_dl = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_dl = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Mengevaluasi model ML tanpa XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=False)

# Mengevaluasi model DL tanpa XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=False, is_dl_model=True)

# Mengevaluasi model ML dengan XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=True)

# Mengevaluasi model DL dengan XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=True, is_dl_model=True)

# Print hasil evaluasi tanpa XAI
print("\nHasil Evaluasi ML/DL tanpa XAI:")
print(hasil_ml_dl)

# Print hasil evaluasi dengan XAI
print("\nHasil Evaluasi ML/DL dengan XAI:")
print(hasil_ml_dl_xai)

# Convert results to DataFrame and save to CSV
df_ml_dl = pd.DataFrame(hasil_ml_dl, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime"])
df_ml_dl_xai = pd.DataFrame(hasil_ml_dl_xai, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime", "XAI"])

df_ml_dl.to_csv("hasil_evaluasi_ml_dl_BF.csv", index=False)
df_ml_dl_xai.to_csv("hasil_evaluasi_ml_dl_xai_BF.csv", index=False)


[LightGBM] [Info] Number of positive: 3917, number of negative: 3100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 7017, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.558216 -> initscore=0.233924
[LightGBM] [Info] Start training from score 0.233924
0:	learn: 0.6212770	total: 136ms	remaining: 13.4s
1:	learn: 0.5612924	total: 138ms	remaining: 6.74s
2:	learn: 0.5106660	total: 139ms	remaining: 4.5s
3:	learn: 0.4645847	total: 141ms	remaining: 3.38s
4:	learn: 0.4262830	total: 143ms	remaining: 2.71s
5:	learn: 0.3935985	total: 145ms	remaining: 2.27s
6:	learn: 0.3662782	total: 147ms	remaining: 1.95s
7:	learn: 0.3411721	total: 149ms	remaining: 1.71s
8:	learn: 0.3185328	total: 150ms	remaining: 1.52s
9:	learn: 0.2988376	total: 152ms	remaining: 1.37s
10:	learn: 0

PermutationExplainer explainer: 3009it [00:36, 70.45it/s]                                                              
PermutationExplainer explainer: 3009it [09:06,  5.39it/s]                                                              
PermutationExplainer explainer: 3009it [00:35, 61.94it/s]                                                              
PermutationExplainer explainer: 3009it [00:43, 53.50it/s]                                                              
PermutationExplainer explainer: 3009it [00:33, 62.58it/s]                                                              
PermutationExplainer explainer: 3009it [00:32, 63.75it/s]                                                              
PermutationExplainer explainer: 3009it [45:23,  1.10it/s]                                                              
PermutationExplainer explainer: 3009it [07:38,  6.43it/s]                                                              
PermutationExplainer explainer: 3009it [

[LightGBM] [Info] Number of positive: 3917, number of negative: 3100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.508404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 7017, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.558216 -> initscore=0.233924
[LightGBM] [Info] Start training from score 0.233924


PermutationExplainer explainer: 3009it [15:24,  3.24it/s]                                                              


0:	learn: 0.6212770	total: 5.63ms	remaining: 558ms
1:	learn: 0.5612924	total: 9.57ms	remaining: 469ms
2:	learn: 0.5106660	total: 13.6ms	remaining: 440ms
3:	learn: 0.4645847	total: 21.7ms	remaining: 520ms
4:	learn: 0.4262830	total: 23.7ms	remaining: 451ms
5:	learn: 0.3935985	total: 27.9ms	remaining: 437ms
6:	learn: 0.3662782	total: 31.9ms	remaining: 424ms
7:	learn: 0.3411721	total: 37ms	remaining: 425ms
8:	learn: 0.3185328	total: 39ms	remaining: 394ms
9:	learn: 0.2988376	total: 42.9ms	remaining: 386ms
10:	learn: 0.2800841	total: 49.6ms	remaining: 401ms
11:	learn: 0.2644325	total: 64ms	remaining: 469ms
12:	learn: 0.2486767	total: 74.9ms	remaining: 501ms
13:	learn: 0.2349105	total: 79.2ms	remaining: 487ms
14:	learn: 0.2233187	total: 81.4ms	remaining: 461ms
15:	learn: 0.2124956	total: 84.8ms	remaining: 445ms
16:	learn: 0.2024412	total: 88ms	remaining: 429ms
17:	learn: 0.1936623	total: 90ms	remaining: 410ms
18:	learn: 0.1852011	total: 92.4ms	remaining: 394ms
19:	learn: 0.1783514	total: 95.4

PermutationExplainer explainer: 3009it [07:11,  6.84it/s]                                                              










Error using SHAP with CNN: operands could not be broadcast together with shapes (7017,12,1) (7017,12) 




Error using SHAP with RNN: in user code:

    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 244, in grad_graph  *
        out = self.model(shap_rAnD)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 371, in custom_grad
        out = op_handlers[type_name](self, op, *grads) # we cut off the shap_ prefix before the lookup
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 663, in handler
        return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\P

