In [1]:
# Library untuk pengolahan data dan visualisasi
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib
import threading

# Library untuk evaluasi dan model machine learning
from sklearn.metrics import confusion_matrix, classification_report
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Library untuk Explainable AI (XAI)
from lime.lime_tabular import LimeTabularExplainer
import shap

# Library untuk Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [2]:
# Input Dataset 

# Define the directory where the parquet files are stored
parquet_directory = "C:\Data Raihan\Penelitian Threshold\Dataset\CTU-13"

# List all parquet files in the directory
parquet_files = [f for f in os.listdir(parquet_directory) if f.endswith('.parquet')]

# Read each parquet file and append it to a list of DataFrames
dataframes = [pd.read_parquet(os.path.join(parquet_directory, file)) for file in parquet_files]

# Concatenate all DataFrames into a single DataFrame
DM = pd.concat(dataframes, ignore_index=True)

In [3]:
DM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10598771 entries, 0 to 10598770
Data columns (total 11 columns):
 #   Column     Dtype  
---  ------     -----  
 0   dur        float32
 1   proto      object 
 2   dir        object 
 3   state      object 
 4   stos       float32
 5   dtos       float32
 6   tot_pkts   int32  
 7   tot_bytes  int64  
 8   src_bytes  int64  
 9   label      object 
 10  Family     object 
dtypes: float32(3), int32(1), int64(2), object(5)
memory usage: 727.8+ MB


In [4]:
#Feature Selection
features_to_drop = ['proto', 'dir', 'state', 'dtos', 'stos', 'tot_bytes', 'label','Family']

# Droping specified columns and target variable
X = DM.drop(features_to_drop, axis=1).values    
y = DM['label'].values

In [5]:
#Remove Nan
X = pd.DataFrame(X).dropna()
y = y[X.index]

In [6]:
# Membuat data yang multiclass menjadi binary class
attack_classes = [
    'flow=From-Botnet-V50-1-TCP-Established-SSL-To-Microsoft-1', 
    'flow=From-Botnet-V50-4-TCP-HTTP-Not-Encrypted-Down-2',
    'flow=From-Botnet-V42-UDP-DNS',
    'flow=From-Botnet-V42-TCP-HTTP-Google-Net-Established-6',
    'flow=From-Botnet-V42-TCP-Established',
    'flow=From-Botnet-V50-6-TCP-HTTP-Google-Net-Established-2'
]

# Mmebuat Label Binary
DM['binary_label'] = DM['label'].apply(lambda x: 1 if x in attack_classes else 0)

# Menampilkan rows 1 untuk mengkonformasi label baru
print(DM[['label', 'binary_label']].head())

# Menghitung label Binary
binary_counts = DM['binary_label'].value_counts()
print("Counts of binary labels:")
print(binary_counts)

# Menampilkan sample
print("Sample attack labels:")
print(DM[DM['binary_label'] == 1].head(10))  # Print 10 samples of attack labels
print("Sample benign labels:")
print(DM[DM['binary_label'] == 0].head(10))  # Print 10 samples of benign labels

                                    label  binary_label
0  flow=Background-Established-cmpgw-CVUT             0
1  flow=Background-Established-cmpgw-CVUT             0
2             flow=Background-TCP-Attempt             0
3             flow=Background-TCP-Attempt             0
4             flow=Background-TCP-Attempt             0
Counts of binary labels:
binary_label
0    10573337
1       25434
Name: count, dtype: int64
Sample attack labels:
                dur proto    dir     state  stos  dtos  tot_pkts  tot_bytes  \
418186     0.000278   udp    <->       CON   0.0   0.0         2        203   
418403     0.020525   udp    <->       CON   0.0   0.0         2        590   
418408     0.045125   tcp     ->  SRPA_SPA   0.0   0.0         7        882   
426914     0.336250   udp    <->       CON   0.0   0.0         2        215   
426933  3514.083496   tcp     ->   SPA_SPA   0.0   0.0       120       7767   
428002     0.459301   udp    <->       CON   0.0   0.0         2        212 

In [7]:
# Apply Min-Max scaling to make X non-negative
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Encoding y dengan LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [9]:
# Elastic Net - Embedded
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.05)
elastic_net.fit(X_scaled, y_encoded)

In [10]:
# Mendapatkan support dari Elastic Net (fitur yang koefisiennya tidak 0)
elastic_net_support = np.where(elastic_net.coef_ != 0)[0]

# Mengambil nama kolom dari X yang sudah difilter
filtered_columns = DM.drop(features_to_drop, axis=1).columns

features = []
for idx in elastic_net_support:
    print(f"Feature {filtered_columns[idx]} dengan koefisien Elastic Net {elastic_net.coef_[idx]}")
    features.append(filtered_columns[idx])

Feature dur dengan koefisien Elastic Net -415.22441581161684


In [11]:
# Tentukan 0.1% dari total dataset
total_samples = int(len(DM) * 0.001)

# Ambil jumlah minimal sampel dari kelas yang lebih sedikit
min_class_samples = DM['binary_label'].value_counts().min()

# Pastikan tidak mengambil lebih dari jumlah minimum kelas yang ada
if total_samples / 2 > min_class_samples:
    total_samples = min_class_samples * 2  # Sesuaikan total sampel agar tidak lebih dari yang tersedia

# Ambil sampel secara acak dari kedua kelas dengan jumlah yang seimbang
sampled_data = DM.groupby('binary_label').apply(lambda x: x.sample(n=int(total_samples / 2), random_state=42)).reset_index(drop=True)

# Cek distribusi kelas setelah sampling
print(sampled_data['binary_label'].value_counts())

binary_label
0    5299
1    5299
Name: count, dtype: int64


  sampled_data = DM.groupby('binary_label').apply(lambda x: x.sample(n=int(total_samples / 2), random_state=42)).reset_index(drop=True)


In [12]:
# Pisahkan data menjadi fitur (X) dan target (y)
X = sampled_data[features]
y = sampled_data['binary_label']

# Pisahkan data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [14]:
# Inisialisasi variabel untuk menyimpan hasil evaluasi
hasil_ml_dl = []
hasil_ml_dl_xai = []

# Encode labels ke bentuk numerik jika diperlukan
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fungsi untuk mengevaluasi model ML/DL
def EvaluateModel(model_name, model, X_train, y_train, X_test, y_test, use_xai=False, is_dl_model=False):
    start_time = time.time()

    # Melatih model
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0) if is_dl_model else model.fit(X_train, y_train)
    
    if is_dl_model:
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    elif hasattr(model, 'predict_proba'):
        # Model dengan metode predict_proba
        y_pred_proba = model.predict_proba(X_test)
        if y_pred_proba.shape[1] > 1:  # Model klasifikasi multi-kelas
            y_pred = np.argmax(y_pred_proba, axis=1)
        else:  # Model klasifikasi biner
            y_pred = (y_pred_proba > 0.5).astype(int).ravel()
    else:
        # Model tanpa metode predict_proba
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    # Menghitung confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Menghitung metrik
    Precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    Recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    F1Score = 2 * ((Precision * Recall) / (Precision + Recall)) if (Precision + Recall) != 0 else 0
    Accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) != 0 else 0

    # Menghitung waktu running
    run_time = time.time() - start_time

    # Jika XAI diperlukan, tambahkan analisis dengan SHAP
    if use_xai:
        # Periksa apakah X_train adalah DataFrame
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns
        else:
            feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]
        
        # Perbaiki format X_train untuk SHAP
        if is_dl_model:
            X_train_for_xai = X_train.reshape((X_train.shape[0], X_train.shape[1]))
            X_test_for_xai = X_test.reshape((X_test.shape[0], X_test.shape[1]))
        else:
            X_train_for_xai = X_train
            X_test_for_xai = X_test

        # Gunakan SHAP
        try:
            if is_dl_model:
                explainer = shap.DeepExplainer(model, X_train_for_xai)
                shap_values = explainer.shap_values(X_test_for_xai)
                shap_summary = np.mean(shap_values[0], axis=0)
            else:
                explainer = shap.Explainer(model.predict_proba, X_train_for_xai)
                shap_values = explainer(X_test_for_xai)
                shap_summary = shap_values.values.mean(axis=0)
        except Exception as e:
            print(f"Error using SHAP with {model_name}: {e}")
            shap_summary = None

        # Simpan hasil evaluasi dengan XAI
        hasil_ml_dl_xai.append([model_name, Precision, Recall, F1Score, Accuracy, run_time, {'SHAP': shap_summary}])
    else:
        # Simpan hasil evaluasi tanpa XAI
        hasil_ml_dl.append([model_name, Precision, Recall, F1Score, Accuracy, run_time])

# Model ML dan DL yang akan dievaluasi
model_ml_dl = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(random_state=0, max_iter=10000),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000),
    "Stochastic Gradient Descent": SGDClassifier(loss='log_loss', random_state=42),
    "ADA Boost": AdaBoostClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(n_estimators=100),
    "LightGBM": LGBMClassifier(n_estimators=100),
    "CatBoost": CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss')
}

model_dl = {
    "DNN": Sequential([
        Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    #"CNN": Sequential([
    #    Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    #    MaxPooling1D(pool_size=2),
    #    Flatten(),
     #   Dense(128, activation='relu'),
    #    Dense(1, activation='sigmoid')
    #]),
    "RNN": Sequential([
        LSTM(100, input_shape=(X_train.shape[1], 1)),
        Dense(1, activation='sigmoid')
    ]),
}

# Pastikan X_train dan X_test memiliki bentuk yang sesuai untuk DL
X_train_dl = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_dl = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))

# Mengevaluasi model ML tanpa XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=False)

# Mengevaluasi model DL tanpa XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=False, is_dl_model=True)

# Mengevaluasi model ML dengan XAI
for model_name, model in model_ml_dl.items():
    EvaluateModel(model_name, model, X_train, y_train_encoded, X_test, y_test_encoded, use_xai=True)

# Mengevaluasi model DL dengan XAI
for model_name, model in model_dl.items():
    model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])
    EvaluateModel(model_name, model, X_train_dl, y_train_encoded, X_test_dl, y_test_encoded, use_xai=True, is_dl_model=True)

# Print hasil evaluasi tanpa XAI
print("\nHasil Evaluasi ML/DL tanpa XAI:")
print(hasil_ml_dl)

# Print hasil evaluasi dengan XAI
print("\nHasil Evaluasi ML/DL dengan XAI:")
print(hasil_ml_dl_xai)

# Convert results to DataFrame and save to CSV
df_ml_dl = pd.DataFrame(hasil_ml_dl, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime"])
df_ml_dl_xai = pd.DataFrame(hasil_ml_dl_xai, columns=["Model", "Precision", "Recall", "F1Score", "Accuracy", "RunTime", "XAI"])

df_ml_dl.to_csv("hasil_evaluasi_ml_dl_ENFC.csv", index=False)
df_ml_dl_xai.to_csv("hasil_evaluasi_ml_dl_xai_ENFC.csv", index=False)

[LightGBM] [Info] Number of positive: 3733, number of negative: 3685
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7418, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503235 -> initscore=0.012942
[LightGBM] [Info] Start training from score 0.012942
0:	learn: 0.6562775	total: 190ms	remaining: 18.8s
1:	learn: 0.6254255	total: 191ms	remaining: 9.38s
2:	learn: 0.5998611	total: 193ms	remaining: 6.24s
3:	learn: 0.5790303	total: 195ms	remaining: 4.67s
4:	learn: 0.5621130	total: 196ms	remaining: 3.73s
5:	learn: 0.5451802	total: 198ms	remaining: 3.1s
6:	learn: 0.5320178	total: 200ms	remaining: 2.65s
7:	learn: 0.5202495	total: 201ms	remaining: 2.32s
8:	learn: 0.5109044	total: 203ms	remaining: 2.05s


ExactExplainer explainer: 3181it [00:10, 20.07it/s]                                                                    
ExactExplainer explainer: 3181it [00:34, 64.98it/s]                                                                    
ExactExplainer explainer: 3181it [01:44, 27.50it/s]                                                                    


[LightGBM] [Info] Number of positive: 3733, number of negative: 3685
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7418, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503235 -> initscore=0.012942
[LightGBM] [Info] Start training from score 0.012942
0:	learn: 0.6562775	total: 1.76ms	remaining: 174ms
1:	learn: 0.6254255	total: 3.4ms	remaining: 166ms
2:	learn: 0.5998611	total: 4.97ms	remaining: 161ms
3:	learn: 0.5790303	total: 6.59ms	remaining: 158ms
4:	learn: 0.5621130	total: 8.2ms	remaining: 156ms
5:	learn: 0.5451802	total: 10.1ms	remaining: 158ms
6:	learn: 0.5320178	total: 11.7ms	remaining: 156ms
7:	learn: 0.5202495	total: 13.3ms	remaining: 153ms
8:	learn: 0.5109044	total: 15.1ms	remaining: 153ms
9:	learn: 0.5007965	total: 16.9ms	remaining: 152ms
10:	l







Error using SHAP with RNN: in user code:

    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 244, in grad_graph  *
        out = self.model(shap_rAnD)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 371, in custom_grad
        out = op_handlers[type_name](self, op, *grads) # we cut off the shap_ prefix before the lookup
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\Python311\Lib\site-packages\shap\explainers\_deep\deep_tf.py", line 663, in handler
        return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
    File "C:\Users\Muhammad Raihan\AppData\Local\Programs\Python\P