In [2]:
import sys
import os
import time
import pickle
import glob

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import *
from flow_analysis import *

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC


In [3]:
def prepare_data(flows_path):
    # TODO: extract malicious flows metadata by reading directory names in flows_path
     
    # Read data
    benign_df = pd.read_parquet(f'{flows_path}/benign')
    malicious_df = pd.read_parquet(f'{flows_path}/malicious')

    # Label the data
    benign_df['label'] = 0  # BENIGN
    malicious_df['label'] = 1  # MALICIOUS

    # Combine datasets
    combined_df = pd.concat([benign_df, malicious_df], ignore_index=True)

    # Filter out flows where packets_count is less than 3
    combined_df = combined_df[combined_df['packets_count'] >= 3]

    # Separate features and labels
    labels = combined_df['label'].values
    features_df = combined_df.drop(['label'], axis=1)

    # Convert DataFrame to numpy array using flows_df_to_np
    features, metas = flows_df_to_np(features_df)
    
    return features, labels, metas

In [4]:
dataset = "icsx-botnet-2014"
print (f"Preparing data for dataset: {dataset}")

# Prepare data
train_features, train_labels, train_meta = prepare_data(f'./../flows/train/{dataset}')
test_features, test_labels, test_meta = prepare_data(f'./../flows/test/{dataset}')

# Print the shape of the data
print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

train_malicious_count = len(train_labels[train_labels == 1])
train_benign_count = len(train_labels[train_labels == 0])
test_malicious_count = len(test_labels[test_labels == 1])
test_benign_count = len(test_labels[test_labels == 0])

# Print number of malicious flows in train and test sets
print("Training:")
print(f"    # malicious flows: {train_malicious_count} ({train_malicious_count / len(train_labels) * 100:.2f}%)")
print(f"    # benign flows: {train_benign_count} ({train_benign_count / len(train_labels) * 100:.2f}%)")

print("Testing:")
print(f"    # malicious flows: {test_malicious_count} ({test_malicious_count / len(test_labels) * 100:.2f}%)")
print(f"    # benign flows: {test_benign_count} ({test_benign_count / len(test_labels) * 100:.2f}%)")

# Fit Min-Max scaling
scaler = MinMaxScaler(feature_range=(0,1)).fit(train_features)

os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
with open(f'./../artifacts/{dataset}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

pca_12 = PCA(n_components=12).fit(train_features)
os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
with open(f'./../artifacts/{dataset}/pca_12.pkl', 'wb') as f:
    pickle.dump(pca_12, f)

# pca_16 = PCA(n_components=16).fit(train_features)
# os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
# with open(f'./../artifacts/{dataset}/pca_16.pkl', 'wb') as f:
#     pickle.dump(pca_16, f)

train_features_pca_12 = pca_12.transform(train_features)
test_features_pca_12 = pca_12.transform(test_features)

# train_features_pca_16 = pca_16.transform(train_features)
# test_features_pca_16 = pca_16.transform(test_features)

train_features.shape[1], train_features_pca_12.shape[1] #, train_features_pca_16.shape[1]

(36, 12, 16)

In [32]:
model_name = "dnn_24_24_24.keras"
epochs = 4

# Build the neural network model
dnn = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.2,  
    epochs=epochs,
    verbose=1
)

dnn.save(f'./../artifacts/{dataset}/{model_name}')

dnn_pca_12 = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features_pca_12.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn_pca_12.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn_pca_12.fit(
    train_features_pca_12, train_labels,
    validation_split=0.2,
    epochs=epochs,
    verbose=1
)

dnn_pca_12.save(f'./../artifacts/{dataset}/pca_12_{model_name}')

Epoch 1/4
3519/3519 ━━━━━━━━━━━━━━━━━━━━ 1:25:15 1s/step - AUC: 0.4400 - Precision: 0.2727 - Recall: 0.8571 - accuracy: 0.4688 - loss: 0.694 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.5627 - Precision: 0.2237 - Recall: 0.1156 - accuracy: 0.7856 - loss: 0.6356    ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.5632 - Precision: 0.2230 - Recall: 0.0707 - accuracy: 0.8048 - loss: 0.587 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.5833 - Precision: 0.2228 - Recall: 0.0535 - accuracy: 0.8109 - loss: 0.553 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - AUC: 0.6132 - Precision: 0.2565 - Recall: 0.0475 - accuracy: 0.8142 - loss: 0.526 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.6536 - Precision: 0.3930 - Recall: 0.0710 - accuracy: 0.8216 - loss: 0.494 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - AUC: 0.6788 - Precision: 0.4691 - Recall: 0.0968 - accuracy: 0.8279 - loss: 0.475 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - AUC: 0.6965 - Precision: 0.5176 - Recall: 0.1185 - accuracy: 0.8329 - loss: 0.461 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - A

In [33]:
print("DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, dnn.predict(test_features, verbose=0))))
print("PCA (12) + DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, dnn_pca_12.predict(test_features_pca_12, verbose=0))))

DNN (24x24x24): AUROC 0.9860966331892551
PCA (12) + DNN (24x24x24): AUROC 0.9908839778823137


In [30]:
model_name = "dnn_16_16_16.keras"
epochs = 4

# Build the neural network model
dnn = tf.keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(train_features.shape[-1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.2,  
    epochs=epochs,
    verbose=1
)

dnn_pca_12 = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features_pca_12.shape[-1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn_pca_12.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn_pca_12.fit(
    train_features_pca_12, train_labels,
    validation_split=0.2,
    epochs=epochs,
    verbose=1
)

dnn.save(f'./../artifacts/{dataset}/{model_name}')
dnn_pca_12.save(f'./../artifacts/{dataset}/pca_12_{model_name}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/4
3519/3519 ━━━━━━━━━━━━━━━━━━━━ 1:21:52 1s/step - AUC: 0.3851 - Precision: 0.0833 - Recall: 0.3333 - accuracy: 0.5938 - loss: 0.690 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.3633 - Precision: 0.0798 - Recall: 0.0360 - accuracy: 0.7806 - loss: 0.6668    ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.4119 - Precision: 0.0799 - Recall: 0.0212 - accuracy: 0.8040 - loss: 0.637 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.4423 - Precision: 0.0799 - Recall: 0.0159 - accuracy: 0.8110 - loss: 0.610 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.4799 - Precision: 0.0800 - Recall: 0.0126 - accuracy: 0.8150 - loss: 0.582 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.5211 - Precision: 0.1630 - Recall: 0.0188 - accuracy: 0.8187 - loss: 0.556 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.5593 - Precision: 0.2817 - Recall: 0.0431 - accuracy: 0.8244 - loss: 0.533 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.5918 - Precision: 0.3714 - Recall: 0.0719 - accuracy: 0.8305 - loss: 0.513 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - A

In [31]:
print("DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, dnn.predict(test_features, verbose=0))))
print("PCA (12) + DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, dnn_pca_12.predict(test_features_pca_12, verbose=0))))

DNN (16x16x16): AUROC 0.987785495044037
PCA (12) + DNN (16x16x16): AUROC 0.9829081116324724


In [35]:
models = {
    "log_reg": LogisticRegression(random_state=0, max_iter=200).fit(train_features, train_labels),
    "dt_9": DecisionTreeClassifier(max_depth=9).fit(train_features, train_labels),
    "rf_9": RandomForestClassifier(max_depth=9, random_state=0).fit(train_features, train_labels)
}

pca_models = {
    "pca_12_log_reg": LogisticRegression(random_state=0, max_iter=200).fit(train_features_pca_12, train_labels),
    "pca_12_dt_9": DecisionTreeClassifier(max_depth=9).fit(train_features_pca_12, train_labels),
    "pca_12_rf_9": RandomForestClassifier(max_depth=9, random_state=0).fit(train_features_pca_12, train_labels)
}

for model_name, model in {**models, **pca_models}.items():
    with open(f'./../artifacts/{dataset}/{model_name}.pkl', 'wb') as f:
        pickle.dump(model, f)

# NOTE: predict_proba returns probabilities for both classes, so [:, 1] is needed
for name, model in models.items():
    print(name + 
        ": R2 score " + str(model.score(test_features, test_labels)) + 
        ", AUROC " + str(roc_auc_score(test_labels, model.predict_proba(test_features)[:, 1])) +
        ", Accuracy " + str(accuracy_score(test_labels, model.predict(test_features))) +
        ", Precision " + str(precision_score(test_labels, model.predict(test_features))) +
        ", Recall " + str(recall_score(test_labels, model.predict(test_features))) +
        ", F1 " + str(f1_score(test_labels, model.predict(test_features))))

for name, model in pca_models.items():
    print(name + 
        ", AUROC " + str(roc_auc_score(test_labels, model.predict_proba(test_features_pca_12)[:, 1])) +
        ", Accuracy " + str(accuracy_score(test_labels, model.predict(test_features_pca_12))) +
        ", Precision " + str(precision_score(test_labels, model.predict(test_features_pca_12))) +
        ", Recall " + str(recall_score(test_labels, model.predict(test_features_pca_12))) +
        ", F1 " + str(f1_score(test_labels, model.predict(test_features_pca_12))))

log_reg: R2 score 0.8828545145768468, AUROC 0.9383509640219992, Accuracy 0.8828545145768468, Precision 0.9942201356285864, Recall 0.8314951835822979, F1 0.9056058691045245
dt_9: R2 score 0.7776794980658552, AUROC 0.9423504511206656, Accuracy 0.7776794980658552, Precision 0.9921417052755522, Recall 0.6763925729442971, F1 0.8043913625468243
rf_9: R2 score 0.763727710161336, AUROC 0.9776757432573321, Accuracy 0.763727710161336, Precision 0.9992766048655021, Recall 0.6508620689655172, F1 0.7882867650166969
pca_12_log_reg, AUROC 0.9367110571275441, Accuracy 0.8799650910463251, Precision 0.9945223304231028, Recall 0.8269405277118526, F1 0.9030223340193613
pca_12_dt_9, AUROC 0.7416381245625113, Accuracy 0.7620530238701765, Precision 0.9949080245267928, Recall 0.6512459863185815, F1 0.7872044212879954
pca_12_rf_9, AUROC 0.990337582753611, Accuracy 0.7597296914803283, Precision 0.9976821281298008, Recall 0.6459758481083345, F1 0.7842003241285074
