In [9]:
dataset = "icsx-ctu-extended"

import sys
import os
import time
import pickle
import glob

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import *
from flow_analysis import *

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC


In [10]:
def prepare_data(flows_path):
    # TODO: extract malicious flows metadata by reading directory names in flows_path
     
    # Read data
    benign_df = pd.read_parquet(f'{flows_path}/benign')
    malicious_df = pd.read_parquet(f'{flows_path}/malicious')

    # Label the data
    benign_df['label'] = 0  # BENIGN
    malicious_df['label'] = 1  # MALICIOUS

    # Combine datasets
    combined_df = pd.concat([benign_df, malicious_df], ignore_index=True)

    # Filter out flows where packets_count is less than 3
    combined_df = combined_df[combined_df['packets_count'] >= 3]

    # Separate features and labels
    labels = combined_df['label'].values
    features_df = combined_df.drop(['label'], axis=1)

    # Convert DataFrame to numpy array using flows_df_to_np
    features, metas = flows_df_to_np(features_df)
    
    return features, labels, metas

In [11]:
print (f"Preparing data for dataset: {dataset}")

# Prepare data
train_features, train_labels, train_meta = prepare_data(f'./../flows/train/{dataset}')
test_features, test_labels, test_meta = prepare_data(f'./../flows/test/{dataset}')

# Print the shape of the data
print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

train_malicious_count = len(train_labels[train_labels == 1])
train_benign_count = len(train_labels[train_labels == 0])
test_malicious_count = len(test_labels[test_labels == 1])
test_benign_count = len(test_labels[test_labels == 0])

# Print number of malicious flows in train and test sets
print("Training:")
print(f"    # malicious flows: {train_malicious_count} ({train_malicious_count / len(train_labels) * 100:.2f}%)")
print(f"    # benign flows: {train_benign_count} ({train_benign_count / len(train_labels) * 100:.2f}%)")

print("Testing:")
print(f"    # malicious flows: {test_malicious_count} ({test_malicious_count / len(test_labels) * 100:.2f}%)")
print(f"    # benign flows: {test_benign_count} ({test_benign_count / len(test_labels) * 100:.2f}%)")

# Fit Min-Max scaling
scaler = MinMaxScaler(feature_range=(0,1)).fit(train_features)

os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
with open(f'./../artifacts/{dataset}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

pca_12 = PCA(n_components=12).fit(train_features)
os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
with open(f'./../artifacts/{dataset}/pca_12.pkl', 'wb') as f:
    pickle.dump(pca_12, f)

# pca_16 = PCA(n_components=16).fit(train_features)
# os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
# with open(f'./../artifacts/{dataset}/pca_16.pkl', 'wb') as f:
#     pickle.dump(pca_16, f)

train_features_pca_12 = pca_12.transform(train_features)
test_features_pca_12 = pca_12.transform(test_features)

# train_features_pca_16 = pca_16.transform(train_features)
# test_features_pca_16 = pca_16.transform(test_features)

Preparing data for dataset: icsx-ctu-extended
Train features shape: (289564, 36)
Test features shape: (600259, 36)
Training:
    # malicious flows: 150386 (51.94%)
    # benign flows: 139178 (48.06%)
Testing:
    # malicious flows: 553782 (92.26%)
    # benign flows: 46477 (7.74%)


In [12]:
model_name = "dnn_24_24_24.keras"

# Build the neural network model
dnn = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.2,  
    epochs=3,
    verbose=1
)

dnn.save(f'./../artifacts/{dataset}/{model_name}')

dnn_pca_12 = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features_pca_12.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn_pca_12.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn_pca_12.fit(
    train_features_pca_12, train_labels,
    validation_split=0.2,
    epochs=3,
    verbose=1
)

dnn_pca_12.save(f'./../artifacts/{dataset}/pca_12_{model_name}')

Epoch 1/3
7240/7240 ━━━━━━━━━━━━━━━━━━━━ 3:23:50 2s/step - AUC: 0.3765 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.3750 - loss: 0.693 ━━━━━━━━━━━━━━━━━━━━ 17s 2ms/step - AUC: 0.6645 - Precision: 0.0695 - Recall: 0.0095 - accuracy: 0.5494 - loss: 0.6849          ━━━━━━━━━━━━━━━━━━━━ 11s 2ms/step - AUC: 0.6852 - Precision: 0.1005 - Recall: 0.0065 - accuracy: 0.5856 - loss: 0.67 ━━━━━━━━━━━━━━━━━━━━ 10s 1ms/step - AUC: 0.7169 - Precision: 0.3450 - Recall: 0.0428 - accuracy: 0.6076 - loss: 0.65 ━━━━━━━━━━━━━━━━━━━━ 10s 1ms/step - AUC: 0.7472 - Precision: 0.4880 - Recall: 0.1063 - accuracy: 0.6334 - loss: 0.63 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7738 - Precision: 0.5719 - Recall: 0.1776 - accuracy: 0.6593 - loss: 0.6074 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7951 - Precision: 0.6264 - Recall: 0.2417 - accuracy: 0.6820 - loss: 0.583 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.8116 - Precision: 0.6642 - Recall: 0.2945 - accuracy: 0.7009 - loss: 0.562 ━━━━━━━━━━━━━━━━━━━━

In [13]:
print("DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, dnn.predict(test_features, verbose=0))))
print("PCA (12) + DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, dnn_pca_12.predict(test_features_pca_12, verbose=0))))

DNN (24x24x24): AUROC 0.7781895229709166
PCA (12) + DNN (24x24x24): AUROC 0.9726919737428559


In [14]:
model_name = "dnn_16_16_16.keras"

# Build the neural network model
dnn = tf.keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(train_features.shape[-1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.2,
    epochs=3,
    verbose=1
)

dnn_pca_12 = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features_pca_12.shape[-1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn_pca_12.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn_pca_12.fit(
    train_features_pca_12, train_labels,
    validation_split=0.2,
    epochs=3,
    verbose=1
)

dnn.save(f'./../artifacts/{dataset}/{model_name}')
dnn_pca_12.save(f'./../artifacts/{dataset}/pca_12_{model_name}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/3
7240/7240 ━━━━━━━━━━━━━━━━━━━━ 2:46:28 1s/step - AUC: 0.6039 - Precision: 0.4688 - Recall: 1.0000 - accuracy: 0.4688 - loss: 0.693 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7058 - Precision: 0.5007 - Recall: 0.8356 - accuracy: 0.5733 - loss: 0.6851    ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7219 - Precision: 0.5614 - Recall: 0.7085 - accuracy: 0.6335 - loss: 0.672 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7371 - Precision: 0.6030 - Recall: 0.6540 - accuracy: 0.6625 - loss: 0.658 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7498 - Precision: 0.6346 - Recall: 0.6248 - accuracy: 0.6815 - loss: 0.643 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.7638 - Precision: 0.6588 - Recall: 0.6132 - accuracy: 0.6969 - loss: 0.625 ━━━━━━━━━━━━━━━━━━━━ 8s 1ms/step - AUC: 0.7778 - Precision: 0.6780 - Recall: 0.6139 - accuracy: 0.7100 - loss: 0.608 ━━━━━━━━━━━━━━━━━━━━ 8s 1ms/step - AUC: 0.7910 - Precision: 0.6944 - Recall: 0.6206 - accuracy: 0.7224 - loss: 0.591 ━━━━━━━━━━━━━━━━━━━━ 8s 1ms/step - A

In [15]:
print("DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, dnn.predict(test_features, verbose=0))))
print("PCA (12) + DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, dnn_pca_12.predict(test_features_pca_12, verbose=0))))

DNN (16x16x16): AUROC 0.9717491209498125
PCA (12) + DNN (16x16x16): AUROC 0.9730145160093551


In [16]:
models = {
    "log_reg": LogisticRegression(random_state=0, max_iter=200).fit(train_features, train_labels),
    "rf_9": RandomForestClassifier(max_depth=9, random_state=0).fit(train_features, train_labels)
}

pca_models = {
    "pca_12_log_reg": LogisticRegression(random_state=0, max_iter=200).fit(train_features_pca_12, train_labels),
    "pca_12_rf_9": RandomForestClassifier(max_depth=9, random_state=0).fit(train_features_pca_12, train_labels)
}

for model_name, model in {**models, **pca_models}.items():
    with open(f'./../artifacts/{dataset}/{model_name}.pkl', 'wb') as f:
        pickle.dump(model, f)

# NOTE: predict_proba returns probabilities for both classes, so [:, 1] is needed
for name, model in models.items():
    print(name + 
        ": AUROC " + str(roc_auc_score(test_labels, model.predict_proba(test_features)[:, 1])) +
        ", Accuracy " + str(accuracy_score(test_labels, model.predict(test_features))) +
        ", Precision " + str(precision_score(test_labels, model.predict(test_features))) +
        ", Recall " + str(recall_score(test_labels, model.predict(test_features))) +
        ", F1 " + str(f1_score(test_labels, model.predict(test_features))))

for name, model in pca_models.items():
    print(name + 
        ": AUROC " + str(roc_auc_score(test_labels, model.predict_proba(test_features_pca_12)[:, 1])) +
        ", Accuracy " + str(accuracy_score(test_labels, model.predict(test_features_pca_12))) +
        ", Precision " + str(precision_score(test_labels, model.predict(test_features_pca_12))) +
        ", Recall " + str(recall_score(test_labels, model.predict(test_features_pca_12))) +
        ", F1 " + str(f1_score(test_labels, model.predict(test_features_pca_12))))

log_reg: AUROC 0.8324475441741848, Accuracy 0.8566285553402782, Precision 0.9752731396426423, Recall 0.8665666273009957, F1 0.9177119209667272
rf_9: AUROC 0.9408362106210955, Accuracy 0.8992768121760774, Precision 0.9755479209081375, Recall 0.9137259788147682, F1 0.9436254627169058
pca_12_log_reg: AUROC 0.8438889823286105, Accuracy 0.820912306187829, Precision 0.9796689158833102, Recall 0.8229610207626827, F1 0.894503430387809
pca_12_rf_9: AUROC 0.961827789095228, Accuracy 0.8473725508488835, Precision 0.9983760071774875, Recall 0.8359227999465494, F1 0.9099556343579169
