In [1]:
dataset = "icsx-ctu-extended"

import sys
import os
import time
import pickle
import glob

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import *
from flow_analysis import *

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC


In [4]:
def prepare_data(flows_path):
    # TODO: extract malicious flows metadata by reading directory names in flows_path
     
    # Read data
    benign_df = pd.read_parquet(f'{flows_path}/benign')
    malicious_df = pd.read_parquet(f'{flows_path}/malicious')

    # Label the data
    benign_df['label'] = 0  # BENIGN
    malicious_df['label'] = 1  # MALICIOUS

    # Combine datasets
    combined_df = pd.concat([benign_df, malicious_df], ignore_index=True)

    # Filter out flows where packets_count is less than 3
    combined_df = combined_df[combined_df['packets_count'] >= 3]

    # Separate features and labels
    labels = combined_df['label'].values
    features_df = combined_df.drop(['label'], axis=1)

    # Convert DataFrame to numpy array using flows_df_to_np
    features, metas = flows_df_to_np(features_df)
    
    return features, labels, metas

In [6]:
print (f"Preparing data for dataset: {dataset}")

# Prepare data
train_features, train_labels, train_meta = prepare_data(f'./../flows/train/{dataset}')
test_features, test_labels, test_meta = prepare_data(f'./../flows/test/{dataset}')

# Print the shape of the data
print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

train_malicious_count = len(train_labels[train_labels == 1])
train_benign_count = len(train_labels[train_labels == 0])
test_malicious_count = len(test_labels[test_labels == 1])
test_benign_count = len(test_labels[test_labels == 0])

# Print number of malicious flows in train and test sets
print("Training:")
print(f"    # malicious flows: {train_malicious_count} ({train_malicious_count / len(train_labels) * 100:.2f}%)")
print(f"    # benign flows: {train_benign_count} ({train_benign_count / len(train_labels) * 100:.2f}%)")

print("Testing:")
print(f"    # malicious flows: {test_malicious_count} ({test_malicious_count / len(test_labels) * 100:.2f}%)")
print(f"    # benign flows: {test_benign_count} ({test_benign_count / len(test_labels) * 100:.2f}%)")

# Fit Min-Max scaling
scaler = MinMaxScaler(feature_range=(0,1)).fit(train_features)

with open(f'./../artifacts/{dataset}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

pca_path = f'./../artifacts/{dataset}/pca_12.pkl'

# Load the PCA from file
with open(pca_path, 'rb') as f:
    pca = pickle.load(f)

pca_12 = PCA(n_components=12).fit(train_features)
os.makedirs(f'./../artifacts/{dataset}', exist_ok=True)
with open(f'./../artifacts/{dataset}/pca_12.pkl', 'wb') as f:
    pickle.dump(pca_12, f)

Preparing data for dataset: icsx-ctu-extended
Train features shape: (289564, 36)
Test features shape: (623300, 36)
Training:
    # malicious flows: 150386 (51.94%)
    # benign flows: 139178 (48.06%)
Testing:
    # malicious flows: 576823 (92.54%)
    # benign flows: 46477 (7.46%)


In [7]:
dnn16 = tf.keras.models.load_model(f'./../artifacts/{dataset}/dnn_16_16_16.keras')
dnn24 = tf.keras.models.load_model(f'./../artifacts/{dataset}/dnn_24_24_24.keras')
rf9 = pickle.load(open(f'./../artifacts/{dataset}/rf_9.pkl', 'rb'))
pca12_rf9 = pickle.load(open(f'./../artifacts/{dataset}/pca_12_rf_9.pkl', 'rb'))

In [8]:
start_time = time.time()
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)
print(f"Time spent for scaling: {time.time() - start_time:.2f} seconds")

start_time = time.time()
train_features_pca_12 = pca_12.transform(train_features)
test_features_pca_12 = pca_12.transform(test_features)
print(f"Time spent for PCA transformation: {time.time() - start_time:.2f} seconds")

Time spent for scaling: 0.05 seconds
Time spent for PCA transformation: 0.03 seconds


In [18]:
start_time = time.time()
predictions16 = dnn16.predict(test_features, verbose=0)
time_spent16 = time.time() - start_time
print(f"Time for DNN 16x16x16 (test): {time_spent16:.2f} seconds")
items_per_second = len(predictions16) / time_spent16
print(f"Items predicted per second: {items_per_second:.2f}")

Time for DNN 16x16x16 (test): 5.01 seconds
Items predicted per second: 124427.51


In [21]:
batch_size = 64
predictions16 = []

start_time = time.time()
for i in range(0, len(test_features), batch_size):
    batch = test_features[i:i+batch_size]
    batch_pred = dnn16.predict(batch, verbose=0)
    predictions16.append(batch_pred)

# Concatenate all batches into a single array
predictions16 = np.concatenate(predictions16, axis=0)

time_spent16 = time.time() - start_time
print(f"Time for DNN 16x16x16 (test): {time_spent16:.2f} seconds")
items_per_second = len(predictions16) / time_spent16
print(f"Items predicted per second: {items_per_second:.2f}")

Time for DNN 16x16x16 (test): 296.16 seconds
Items predicted per second: 2104.60


In [None]:
start_time = time.time()
predictions24 = dnn24.predict(test_features, verbose=0)
time_spent24 = time.time() - start_time
print(f"Time for DNN 24x24x24 (test): {time_spent24} seconds")
items_per_second = len(predictions24) / time_spent24
print(f"Items predicted per second: {items_per_second:.2f}")

Time for DNN 24x24x24 (test): 5.059830904006958 seconds
Items predicted per second: 123185.93


In [22]:
batch_size = 64
predictions24 = []

start_time = time.time()
for i in range(0, len(test_features), batch_size):
    batch = test_features[i:i+batch_size]
    batch_pred = dnn24.predict(batch, verbose=0)
    predictions24.append(batch_pred)

# Concatenate all batches into a single array
predictions24 = np.concatenate(predictions24, axis=0)

time_spent24 = time.time() - start_time
print(f"Time for DNN 24x24x24 (test): {time_spent24} seconds")
items_per_second = len(predictions24) / time_spent24
print(f"Items predicted per second: {items_per_second:.2f}")

Time for DNN 24x24x24 (test): 300.215380191803 seconds
Items predicted per second: 2076.18


In [None]:
start_time = time.time()
rf9_predictions = rf9.predict(test_features)
time_spent_rf9 = time.time() - start_time
print(f"Time for RF_9 (test): {time_spent_rf9:.2f} seconds")
items_per_second = len(rf9_predictions) / time_spent_rf9
print(f"Items predicted per second: {items_per_second:.2f}")

Time for RF_9 (test): 0.77 seconds
Items predicted per second: 805748.21


In [24]:
batch_size = 64
rf9_predictions = []

start_time = time.time()
for i in range(0, len(test_features), batch_size):
    batch = test_features[i:i+batch_size]
    batch_pred = rf9.predict(batch)
    rf9_predictions.append(batch_pred)

# Concatenate all batches into a single array
rf9_predictions = np.concatenate(rf9_predictions, axis=0)

time_spent_rf9 = time.time() - start_time
print(f"Time for RF_9 (test): {time_spent_rf9:.2f} seconds")
items_per_second = len(rf9_predictions) / time_spent_rf9
print(f"Items predicted per second: {items_per_second:.2f}")

Time for RF_9 (test): 14.72 seconds
Items predicted per second: 42344.27


In [20]:
start_time = time.time()
pca12_rf9_predictions = pca12_rf9.predict(test_features_pca_12)
time_spent_pca12_rf9 = time.time() - start_time
print(f"Time for PCA_12 + RF_9 (test): {time_spent_pca12_rf9:.2f} seconds")
items_per_second = len(pca12_rf9_predictions) / time_spent_pca12_rf9
print(f"Items predicted per second: {items_per_second:.2f}")

Time for PCA_12 + RF_9 (test): 0.54 seconds
Items predicted per second: 1144971.87


In [28]:
batch_size = 64
pca12_rf9_predictions = []

start_time = time.time()
for i in range(0, len(test_features_pca_12), batch_size):
    batch = test_features_pca_12[i:i+batch_size]
    batch_pred = pca12_rf9.predict(batch)
    pca12_rf9_predictions.append(batch_pred)

# Concatenate all batches into a single array
pca12_rf9_predictions = np.concatenate(pca12_rf9_predictions, axis=0)

time_spent_pca12_rf9 = time.time() - start_time
print(f"Time for PCA_12 + RF_9 (test): {time_spent_pca12_rf9:.2f} seconds")
items_per_second = len(pca12_rf9_predictions) / time_spent_pca12_rf9
print(f"Items predicted per second: {items_per_second:.2f}")

Time for PCA_12 + RF_9 (test): 14.18 seconds
Items predicted per second: 43970.61


In [15]:
print("DNN (24x24x24): AUROC " + str(roc_auc_score(test_labels, predictions24)))
print("DNN (16x16x16): AUROC " + str(roc_auc_score(test_labels, predictions16)))

DNN (24x24x24): AUROC 0.97511784624462
DNN (16x16x16): AUROC 0.9713893276719773
