In [1]:
# Library
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from lime.lime_tabular import LimeTabularExplainer
import shap
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from keras.optimizers import Adam

In [2]:
# Input Dataset 

DM = pd.read_parquet("C:\\Data Raihan\\Penelitian Threshold\\Dataset\\CCCS-CIC-AndMal-2020\\cicandmal2020-dynamic.parquet")#DM--> Dataset Malware

In [3]:
DM.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53439 entries, 0 to 53438
Data columns (total 145 columns):
 #    Column                                                                              Non-Null Count  Dtype 
---   ------                                                                              --------------  ----- 
 0    Memory_PssTotal                                                                     53439 non-null  int32 
 1    Memory_PssClean                                                                     53439 non-null  int32 
 2    Memory_SharedDirty                                                                  53439 non-null  int32 
 3    Memory_PrivateDirty                                                                 53439 non-null  int32 
 4    Memory_SharedClean                                                                  53439 non-null  int32 
 5    Memory_PrivateClean                                                                 53439 non

In [4]:
# Menampilkan nilai unik pada kolom Category, Family, dan Label
unique_category = DM['Category'].unique()
unique_family = DM['Family'].unique()
unique_label = DM['Label'].unique()

print("Unique Categories:", unique_category)
print("Unique Families:", unique_family)
print("Unique Labels:", unique_label)

Unique Categories: ['Trojan_Spy' 'FileInfector' 'Zero_Day' 'Backdoor' 'Ransomware'
 'No_Category' 'Trojan_SMS' 'Trojan_Dropper' 'Trojan_Banker' 'Scareware'
 'PUA' 'Adware' 'Riskware' 'Trojan']
Unique Families: ['smsthief' 'sandr' 'smforw' ... 'pesabti' 'mycompany' 'koomer']
Unique Labels: ['Trojan_Spy_before_reboot_Cat' 'FileInfector_after_reboot_Cat'
 'Zero_Day_after_reboot_Cat' 'Trojan_Spy_after_reboot_Cat'
 'Backdoor_before_reboot_Cat' 'Ransomware_after_reboot_Cat'
 'Ransomware_before_reboot_Cat' 'No_Category_before_reboot_Cat'
 'Trojan_SMS_after_reboot_Cat' 'Zero_Day_before_reboot_Cat'
 'Backdoor_after_reboot_Cat' 'Trojan_Dropper_after_reboot_Cat'
 'Trojan_Banker_after_reboot_Cat' 'Scareware_before_reboot_Cat'
 'PUA_after_reboot_Cat' 'Scareware_after_reboot_Cat'
 'PUA_before_reboot_Cat' 'Adware_before_reboot_Cat'
 'Riskware_before_reboot_Cat' 'Riskware_after_reboot_Cat'
 'Trojan_Banker_before_reboot_Cat' 'Trojan_SMS_before_reboot_Cat'
 'FileInfector_before_reboot_Cat' 'Trojan_befor

In [5]:
# Definisikan kategori benign dan malware
benign_categories = ['Adware', 'Riskware', 'PUA', 'No_Category']
malware_categories = ['Trojan_Spy', 'FileInfector', 'Zero_Day', 'Backdoor', 
                      'Ransomware', 'Trojan_SMS', 'Trojan_Dropper', 'Trojan_Banker', 
                      'Scareware', 'Trojan']

# Relabeling ke dalam kelas binari: 'Benign' dan 'Malware'
DM['Binary_Label'] = DM['Category'].apply(lambda x: 'Benign' if x in benign_categories else 'Malware')

# Cek hasil unique Binary_Label setelah relabeling
unique_binary_labels = DM['Binary_Label'].unique()
print("Unique Binary Labels after relabeling:", unique_binary_labels)

Unique Binary Labels after relabeling: ['Malware' 'Benign']


In [6]:
#Feature Selection
features = DM.drop(['Hash', 'Category', 'Family', 'Label', 'Network_TotalReceivedBytes', 'Network_TotalReceivedPackets', 'Binary_Label'],axis=1).columns.tolist()
# Target variable
y = DM['Binary_Label'].values

In [7]:
# Tentukan 1% dari total dataset
total_samples = int(len(DM) * 0.01)

# Ambil sampel secara acak dari kedua kelas dengan jumlah yang seimbang
sampled_data = DM.groupby('Binary_Label').apply(lambda x: x.sample(n=int(total_samples / 2), random_state=42)).reset_index(drop=True)

# Cek distribusi kelas setelah sampling
print(sampled_data['Binary_Label'].value_counts())

Binary_Label
Benign     267
Malware    267
Name: count, dtype: int64


  sampled_data = DM.groupby('Binary_Label').apply(lambda x: x.sample(n=int(total_samples / 2), random_state=42)).reset_index(drop=True)


In [8]:
for i, column_name in enumerate(features):
    print(f"{i+1:3} {column_name:40} {sampled_data[column_name].count():<15} {sampled_data[column_name].dtype}")

  1 Memory_PssTotal                          534             int32
  2 Memory_PssClean                          534             int32
  3 Memory_SharedDirty                       534             int32
  4 Memory_PrivateDirty                      534             int32
  5 Memory_SharedClean                       534             int32
  6 Memory_PrivateClean                      534             int32
  7 Memory_SwapPssDirty                      534             int8
  8 Memory_HeapSize                          534             int32
  9 Memory_HeapAlloc                         534             int32
 10 Memory_HeapFree                          534             int32
 11 Memory_Views                             534             int16
 12 Memory_ViewRootImpl                      534             int16
 13 Memory_AppContexts                       534             int16
 14 Memory_Activities                        534             int16
 15 Memory_Assets                            534             in

In [9]:
# Pisahkan data menjadi fitur (X) dan target (y)
X = sampled_data[features]
y = sampled_data['Binary_Label']

# Split data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Membuat imputer untuk menggantikan NaN dengan rata-rata
imputer = SimpleImputer(strategy='mean')

# Menangani missing values pada X_train dan X_test
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [11]:
# Gabungkan semua label
all_labels = np.concatenate([y_train, y_test])

# Inisialisasi LabelEncoder
label_encoder = LabelEncoder()

# Fit encoder pada gabungan label
label_encoder.fit(all_labels)

# Transform data
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Cek label yang ada di y_train dan y_test
train_labels = set(label_encoder.classes_)
test_labels = set(y_test)

# Menampilkan label yang tidak ada di y_train
missing_labels = test_labels - train_labels
if missing_labels:
    print(f"Labels in y_test that are not in y_train: {missing_labels}")
else:
    print("All labels in y_test are present in y_train.")

All labels in y_test are present in y_train.


In [12]:
# Inisialisasi variabel untuk menyimpan hasil evaluasi
hasil_ml_dl = []
hasil_ml_dl_xai = []

# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fungsi untuk mengevaluasi model ML/DL
def EvaluateModel(model_name, model, X_train, y_train, X_test, y_test, use_xai=False, is_dl_model=False):
    start_time = time.time()

    # Melatih model
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0) if is_dl_model else model.fit(X_train, y_train)
    
    if is_dl_model:
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    elif hasattr(model, 'predict_proba'):
        # Model dengan metode predict_proba
        y_pred_proba = model.predict_proba(X_test)
        if y_pred_proba.shape[1] > 1:  # Model klasifikasi multi-kelas
            y_pred = np.argmax(y_pred_proba, axis=1)
        else:  # Model klasifikasi biner
            y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    else:
        # Model tanpa metode predict_proba
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    # Menghitung confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Menghitung metrik
    Precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    Recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    F1Score = 2 * ((Precision * Recall) / (Precision + Recall)) if (Precision + Recall) != 0 else 0
    Accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0

    # Menghitung waktu running
    run_time = time.time() - start_time

    # Jika XAI diperlukan, tambahkan analisis dengan SHAP
    if use_xai:
        # Periksa apakah X_train adalah DataFrame
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns
        else:
            feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]
        
        # Perbaiki format X_train untuk SHAP
        if is_dl_model:
            X_train_for_xai = X_train.reshape((X_train.shape[0], X_train.shape[1]))
            X_test_for_xai = X_test.reshape((X_test.shape[0], X_test.shape[1]))
        else:
            X_train_for_xai = X_train
            X_test_for_xai = X_test

        # Gunakan SHAP
        try:
            if is_dl_model:
                explainer = shap.DeepExplainer(model, X_train_for_xai)
                shap_values = explainer.shap_values(X_test_for_xai)
                shap_summary = np.mean(shap_values[0], axis=0)
            else:
                explainer = shap.Explainer(model.predict_proba, X_train_for_xai)
                shap_values = explainer(X_test_for_xai)
                shap_summary = shap_values.values.mean(axis=0)
        except Exception as e:
            print(f"Error using SHAP with {model_name}: {e}")
            shap_summary = None

        # Simpan hasil evaluasi dengan XAI
        hasil_ml_dl_xai.append([model_name, Precision, Recall, F1Score, Accuracy, run_time, {'SHAP': shap_summary}])
    else:
        # Simpan hasil evaluasi tanpa XAI
        hasil_ml_dl.append([model_name, Precision, Recall, F1Score, Accuracy, run_time])

# Model ML dan DL yang akan dievaluasi
model_ml_dl = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000),
    "Stochastic Gradient Descent": SGDClassifier(loss='log_loss', random_state=42),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(n_estimators=100),
    "LightGBM": LGBMClassifier(n_estimators=100),
    "CatBoost": CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss')
}

model_dl = {
    "DNN": Sequential([
        Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "CNN": Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "RNN": Sequential([
        LSTM(100, input_shape=(X_train.shape[1], 1)),
        Dense(1, activation='sigmoid')
    ]),
}

# Pastikan X_train dan X_test memiliki bentuk yang sesuai untuk DL
X_train_dl = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_dl = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Mengevaluasi model ML tanpa XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=False)

# Mengevaluasi model DL tanpa XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=False, is_dl_model=True)

# Mengevaluasi model ML dengan XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=True)

# Mengevaluasi model DL dengan XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=True, is_dl_model=True)

# Print hasil evaluasi tanpa XAI
print("\nHasil Evaluasi ML/DL tanpa XAI:")
print(hasil_ml_dl)

# Print hasil evaluasi dengan XAI
print("\nHasil Evaluasi ML/DL dengan XAI:")
print(hasil_ml_dl_xai)

# Convert results to DataFrame and save to CSV
df_ml_dl = pd.DataFrame(hasil_ml_dl, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime"])
df_ml_dl_xai = pd.DataFrame(hasil_ml_dl_xai, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime", "XAI"])

df_ml_dl.to_csv("hasil_evaluasi_ml_dl_BF.csv", index=False)
df_ml_dl_xai.to_csv("hasil_evaluasi_ml_dl_xai_BF.csv", index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 185, number of negative: 188
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3541
[LightGBM] [Info] Number of data points in the train set: 373, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495979 -> initscore=-0.016086
[LightGBM] [Info] Start training from score -0.016086
0:	learn: 0.6564064	total: 137ms	remaining: 13.6s
1:	learn: 0.6222528	total: 140ms	remaining: 6.84s
2:	learn: 0.5987228	total: 142ms	remaining: 4.6s
3:	learn: 0.5758995	total: 145ms	remaining: 3.48s
4:	learn: 0.5524841	total: 148ms	remaining: 2.81s
5:	learn: 0.5334939	total: 151ms	remaining: 2.36s
6:	learn: 0.5035752	total: 154ms	remaining: 2.04s
7:	learn: 0.4904400	total: 157ms	remaining: 1.81s
8:	learn: 0.4793135	total: 161ms	remaining: 1.63s

PermutationExplainer explainer: 162it [00:11,  4.46it/s]                                                               
PermutationExplainer explainer: 162it [00:20,  3.88it/s]                                                               
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
PermutationExplainer explainer: 162it [00:13,  3.20it/s]                                                               
PermutationExplainer explainer: 162it [03:11,  1.25s/it]                                                               
PermutationExplainer explainer: 162it [00:11,  1.36it/s]                                                               
PermutationExplainer expl

[LightGBM] [Info] Number of positive: 185, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3541
[LightGBM] [Info] Number of data points in the train set: 373, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495979 -> initscore=-0.016086
[LightGBM] [Info] Start training from score -0.016086


PermutationExplainer explainer: 162it [00:10,  5.04s/it]                                                               


0:	learn: 0.6564064	total: 3.52ms	remaining: 348ms
1:	learn: 0.6222528	total: 6.63ms	remaining: 325ms
2:	learn: 0.5987228	total: 9.78ms	remaining: 316ms
3:	learn: 0.5758995	total: 12.7ms	remaining: 305ms
4:	learn: 0.5524841	total: 15.6ms	remaining: 296ms
5:	learn: 0.5334939	total: 18.5ms	remaining: 290ms
6:	learn: 0.5035752	total: 21.6ms	remaining: 287ms
7:	learn: 0.4904400	total: 24.8ms	remaining: 285ms
8:	learn: 0.4793135	total: 27.6ms	remaining: 279ms
9:	learn: 0.4627931	total: 30.5ms	remaining: 275ms
10:	learn: 0.4476481	total: 33.4ms	remaining: 271ms
11:	learn: 0.4348368	total: 36.5ms	remaining: 267ms
12:	learn: 0.4226281	total: 39.6ms	remaining: 265ms
13:	learn: 0.4117943	total: 42.8ms	remaining: 263ms
14:	learn: 0.3995265	total: 45.7ms	remaining: 259ms
15:	learn: 0.3903862	total: 48.8ms	remaining: 256ms
16:	learn: 0.3809009	total: 51.7ms	remaining: 253ms
17:	learn: 0.3731086	total: 54.8ms	remaining: 250ms
18:	learn: 0.3658266	total: 58ms	remaining: 247ms
19:	learn: 0.3574337	tot

PermutationExplainer explainer: 162it [02:59,  1.18s/it]                                                               










Error using SHAP with CNN: operands could not be broadcast together with shapes (373,139,1) (373,139) 




Error using SHAP with RNN: in user code:

    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 244, in grad_graph  *
        out = self.model(shap_rAnD)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 371, in custom_grad
        out = op_handlers[type_name](self, op, *grads) # we cut off the shap_ prefix before the lookup
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 663, in handler
        return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\P