In [9]:
import sys
import os
import time
import pickle
import glob

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import *
from flow_analysis import *

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC


In [10]:
def prepare_data(flows_path):
    # TODO: extract malicious flows metadata by reading directory names in flows_path
     
    # Read data
    benign_df = pd.read_parquet(f'{flows_path}/benign')
    malicious_df = pd.read_parquet(f'{flows_path}/malicious')

    # Label the data
    benign_df['label'] = 0  # BENIGN
    malicious_df['label'] = 1  # MALICIOUS

    # Combine datasets
    combined_df = pd.concat([benign_df, malicious_df], ignore_index=True)

    # Filter out flows where packets_count is less than 3
    combined_df = combined_df[combined_df['packets_count'] >= 3]

    # Separate features and labels
    labels = combined_df['label'].values
    features_df = combined_df.drop(['label'], axis=1)

    # Convert DataFrame to numpy array using flows_df_to_np
    features, metas = flows_df_to_np(features_df)
    
    return features, labels, metas

In [11]:
dataset = "icsx-botnet-2014"
print (f"Preparing data for dataset: {dataset}")

# Prepare data
train_features, train_labels, train_meta = prepare_data(f'./../flows_udp/train/{dataset}')
test_features, test_labels, test_meta = prepare_data(f'./../flows_udp/test/{dataset}')

# Print the shape of the data
print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

train_malicious_count = len(train_labels[train_labels == 1])
train_benign_count = len(train_labels[train_labels == 0])
test_malicious_count = len(test_labels[test_labels == 1])
test_benign_count = len(test_labels[test_labels == 0])

# Print number of malicious flows in train and test sets
print("Training:")
print(f"    # malicious flows: {train_malicious_count} ({train_malicious_count / len(train_labels) * 100:.2f}%)")
print(f"    # benign flows: {train_benign_count} ({train_benign_count / len(train_labels) * 100:.2f}%)")

print("Testing:")
print(f"    # malicious flows: {test_malicious_count} ({test_malicious_count / len(test_labels) * 100:.2f}%)")
print(f"    # benign flows: {test_benign_count} ({test_benign_count / len(test_labels) * 100:.2f}%)")

# Fit Min-Max scaling
scaler = MinMaxScaler(feature_range=(0,1)).fit(train_features)

os.makedirs(f'./../artifacts_udp/{dataset}', exist_ok=True)
with open(f'./../artifacts_udp/{dataset}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

pca_12 = PCA(n_components=12).fit(train_features)
os.makedirs(f'./../artifacts_udp/{dataset}', exist_ok=True)
with open(f'./../artifacts_udp/{dataset}/pca_12.pkl', 'wb') as f:
    pickle.dump(pca_12, f)

# pca_16 = PCA(n_components=16).fit(train_features)
# os.makedirs(f'./../artifacts_udp/{dataset}', exist_ok=True)
# with open(f'./../artifacts_udp/{dataset}/pca_16.pkl', 'wb') as f:
#     pickle.dump(pca_16, f)

train_features_pca_12 = pca_12.transform(train_features)
test_features_pca_12 = pca_12.transform(test_features)

# train_features_pca_16 = pca_16.transform(train_features)
# test_features_pca_16 = pca_16.transform(test_features)

train_features.shape[1], train_features_pca_12.shape[1] #, train_features_pca_16.shape[1]

Preparing data for dataset: icsx-botnet-2014
Train features shape: (22626, 36)
Test features shape: (27057, 36)
Training:
    # malicious flows: 5653 (24.98%)
    # benign flows: 16973 (75.02%)
Testing:
    # malicious flows: 6984 (25.81%)
    # benign flows: 20073 (74.19%)


(36, 12)

In [12]:
model_name = "dnn_24_24_24.keras"
epochs = 3

# Build the neural network model
dnn = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.2,  
    epochs=epochs,
    verbose=1
)

dnn.save(f'./../artifacts_udp/{dataset}/{model_name}')

dnn_pca_12 = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features_pca_12.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn_pca_12.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn_pca_12.fit(
    train_features_pca_12, train_labels,
    validation_split=0.2,
    epochs=epochs,
    verbose=1
)

dnn_pca_12.save(f'./../artifacts_udp/{dataset}/pca_12_{model_name}')

Epoch 1/3
566/566 ━━━━━━━━━━━━━━━━━━━━ 5:07 544ms/step - AUC: 0.5000 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9688 - loss: 0.676 ━━━━━━━━━━━━━━━━━━━━ 0s 554us/step - AUC: 0.5683 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9396 - loss: 0.5435  ━━━━━━━━━━━━━━━━━━━━ 0s 507us/step - AUC: 0.5757 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9387 - loss: 0.432 ━━━━━━━━━━━━━━━━━━━━ 0s 512us/step - AUC: 0.6044 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9379 - loss: 0.382 ━━━━━━━━━━━━━━━━━━━━ 0s 506us/step - AUC: 0.6329 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9377 - loss: 0.348 ━━━━━━━━━━━━━━━━━━━━ 0s 511us/step - AUC: 0.6546 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9378 - loss: 0.326 ━━━━━━━━━━━━━━━━━━━━ 1s 927us/step - AUC: 0.6682 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.9379 - loss: 0.3132 - val_AUC: 0.0000e+00 - val_Precision: 0.0000e+00 - val_Recall: 0.0000e+00 - val_accurac

In [13]:
print("DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, dnn.predict(test_features, verbose=0))))
print("PCA (12) + DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, dnn_pca_12.predict(test_features_pca_12, verbose=0))))

DNN (24x24x24): AUROC 0.5995127342759067
PCA (12) + DNN (24x24x24): AUROC 0.541264547631386


In [14]:
model_name = "dnn_16_16_16.keras"
epochs = 3

# Build the neural network model
dnn = tf.keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(train_features.shape[-1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.2,  
    epochs=epochs,
    verbose=1
)

dnn_pca_12 = tf.keras.Sequential([
    tf.keras.Input(shape=(train_features_pca_12.shape[-1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

dnn_pca_12.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])
history = dnn_pca_12.fit(
    train_features_pca_12, train_labels,
    validation_split=0.2,
    epochs=epochs,
    verbose=1
)

dnn.save(f'./../artifacts_udp/{dataset}/{model_name}')
dnn_pca_12.save(f'./../artifacts_udp/{dataset}/pca_12_{model_name}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/3
566/566 ━━━━━━━━━━━━━━━━━━━━ 5:09 547ms/step - AUC: 0.1782 - Precision: 0.0968 - Recall: 1.0000 - accuracy: 0.1250 - loss: 0.741 ━━━━━━━━━━━━━━━━━━━━ 0s 538us/step - AUC: 0.4540 - Precision: 0.0587 - Recall: 0.2324 - accuracy: 0.6834 - loss: 0.6452  ━━━━━━━━━━━━━━━━━━━━ 0s 500us/step - AUC: 0.4900 - Precision: 0.0564 - Recall: 0.1377 - accuracy: 0.7887 - loss: 0.531 ━━━━━━━━━━━━━━━━━━━━ 0s 489us/step - AUC: 0.5215 - Precision: 0.0557 - Recall: 0.1008 - accuracy: 0.8284 - loss: 0.465 ━━━━━━━━━━━━━━━━━━━━ 0s 484us/step - AUC: 0.5517 - Precision: 0.0554 - Recall: 0.0805 - accuracy: 0.8507 - loss: 0.421 ━━━━━━━━━━━━━━━━━━━━ 0s 481us/step - AUC: 0.5779 - Precision: 0.0552 - Recall: 0.0675 - accuracy: 0.8646 - loss: 0.391 ━━━━━━━━━━━━━━━━━━━━ 1s 887us/step - AUC: 0.5872 - Precision: 0.0551 - Recall: 0.0636 - accuracy: 0.8689 - loss: 0.3817 - val_AUC: 0.0000e+00 - val_Precision: 0.0000e+00 - val_Recall: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 2.4742
Epoch 2/3
566/566 ━━━━━

In [15]:
print("DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, dnn.predict(test_features, verbose=0))))
print("PCA (12) + DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, dnn_pca_12.predict(test_features_pca_12, verbose=0))))

DNN (16x16x16): AUROC 0.5996406287154977
PCA (12) + DNN (16x16x16): AUROC 0.5060319353260941


In [16]:
models = {
    "log_reg": LogisticRegression(random_state=0, max_iter=200).fit(train_features, train_labels),
    "dt_9": DecisionTreeClassifier(max_depth=9).fit(train_features, train_labels),
    "rf_9": RandomForestClassifier(max_depth=9, random_state=0).fit(train_features, train_labels)
}

pca_models = {
    "pca_12_log_reg": LogisticRegression(random_state=0, max_iter=200).fit(train_features_pca_12, train_labels),
    "pca_12_dt_9": DecisionTreeClassifier(max_depth=9).fit(train_features_pca_12, train_labels),
    "pca_12_rf_9": RandomForestClassifier(max_depth=9, random_state=0).fit(train_features_pca_12, train_labels)
}

for model_name, model in {**models, **pca_models}.items():
    with open(f'./../artifacts_udp/{dataset}/{model_name}.pkl', 'wb') as f:
        pickle.dump(model, f)

# NOTE: predict_proba returns probabilities for both classes, so [:, 1] is needed
for name, model in models.items():
    print(name + 
        ": R2 score " + str(model.score(test_features, test_labels)) + 
        ", AUROC " + str(roc_auc_score(test_labels, model.predict_proba(test_features)[:, 1])) +
        ", Accuracy " + str(accuracy_score(test_labels, model.predict(test_features))) +
        ", Precision " + str(precision_score(test_labels, model.predict(test_features))) +
        ", Recall " + str(recall_score(test_labels, model.predict(test_features))) +
        ", F1 " + str(f1_score(test_labels, model.predict(test_features))))

for name, model in pca_models.items():
    print(name + 
        ", AUROC " + str(roc_auc_score(test_labels, model.predict_proba(test_features_pca_12)[:, 1])) +
        ", Accuracy " + str(accuracy_score(test_labels, model.predict(test_features_pca_12))) +
        ", Precision " + str(precision_score(test_labels, model.predict(test_features_pca_12))) +
        ", Recall " + str(recall_score(test_labels, model.predict(test_features_pca_12))) +
        ", F1 " + str(f1_score(test_labels, model.predict(test_features_pca_12))))

log_reg: R2 score 0.7274642421554496, AUROC 0.6405418012056682, Accuracy 0.7274642421554496, Precision 0.2639225181598063, Recall 0.031214203894616266, F1 0.0558258642765685
dt_9: R2 score 0.7479025760431681, AUROC 0.5848635049366491, Accuracy 0.7479025760431681, Precision 0.5163819095477387, Recall 0.3678407789232532, F1 0.4296345848315076
rf_9: R2 score 0.7719259341390398, AUROC 0.6432491302222261, Accuracy 0.7719259341390398, Precision 0.5946008843379101, Recall 0.365836197021764, F1 0.45297402712525486
pca_12_log_reg, AUROC 0.6406919975480104, Accuracy 0.7274272831429944, Precision 0.26417370325693607, Recall 0.0313573883161512, F1 0.05606041213362345
pca_12_dt_9, AUROC 0.5449782620468508, Accuracy 0.7207377018886055, Precision 0.4371152154793316, Recall 0.28465063001145474, F1 0.3447797433229275
pca_12_rf_9, AUROC 0.4805521309134602, Accuracy 0.7313079794507891, Precision 0.46622579121398205, Recall 0.28264604810996563, F1 0.351934391157069
