In [4]:
import sys
import os
import time
import pickle

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from flow_features import *
from flow_analysis import *

# test_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'flows', 'train', 'malicious', 'Neris'))
# test = pd.read_parquet(test_path)
# test

In [5]:
!where python

c:\Users\Wind\net-watcher\Scripts\python.exe
C:\Users\Wind\AppData\Local\Programs\Python\Python311\python.exe
C:\Users\Wind\AppData\Local\Microsoft\WindowsApps\python.exe


In [11]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score

# Paths to data directories
benign_path = './../flows/train/benign'
malicious_path = './../flows/train/malicious'

# Read data
benign_df = pd.read_parquet(benign_path)
malicious_df = pd.read_parquet(malicious_path)

# Label the data
benign_df['label'] = 0  # BENIGN
malicious_df['label'] = 1  # MALICIOUS

# Combine datasets
combined_df = pd.concat([benign_df, malicious_df], ignore_index=True)

# Filter out flows where packets_count is less than 3
combined_df = combined_df[combined_df['packets_count'] >= 3]

# Save feature types only
feature_types = combined_df.drop(['label'], axis=1).dtypes.apply(lambda x: x.name).to_dict()
with open('./../artifacts/feature_types.txt', 'w') as f:
    for name, dtype in feature_types.items():
        f.write(f"{name}: {dtype}\n")

# Separate features and labels
labels = combined_df['label'].values
features_df = combined_df.drop(['label'], axis=1)

# Convert DataFrame to numpy array using flows_df_to_np
features, metas = flows_df_to_np(features_df)

# Apply Min-Max scaling
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Save the scaler
with open('./../artifacts/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Prepare TensorFlow dataset
train_features = scaled_features
train_labels = labels
train_meta = metas  # Auxiliary meta data

# Build the neural network model
dnn = tf.keras.Sequential([
    layers.Dense(24, activation='relu', input_shape=(train_features.shape[-1],)),
    layers.Dense(24, activation='relu'),
    layers.Dense(24, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

# Add additional metrics
dnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC', 'Precision', 'Recall'])

# Train the model with validation split
epochs = 5
history = dnn.fit(
    train_features, train_labels,
    validation_split=0.1,  # Using validation split during training
    epochs=epochs,
    verbose=1
)

# Save the model in .keras format
dnn.save('./../artifacts/model.keras')

# Load the model before evaluation
dnn = tf.keras.models.load_model('./../artifacts/model.keras')

# Evaluate on the training data
y_pred_prob = dnn.predict(train_features).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

# Calculate AUROC
auroc = roc_auc_score(train_labels, y_pred_prob)
print(f"Training AUROC: {auroc:.4f}")

# Calculate False Positives
cm = confusion_matrix(train_labels, y_pred)
tn, fp, fn, tp = cm.ravel()
false_positives = fp
print(f"False Positives: {false_positives}")

# Calculate Accuracy
accuracy = accuracy_score(train_labels, y_pred)
print(f"Training Accuracy: {accuracy:.4f}")

# Calculate Precision and Recall
precision = precision_score(train_labels, y_pred)
recall = recall_score(train_labels, y_pred)
print(f"Training Precision: {precision:.4f}")
print(f"Training Recall: {recall:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
3959/3959 ━━━━━━━━━━━━━━━━━━━━ 1:37:49 1s/step - AUC: 0.4417 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.6250 - loss: 0.687 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.7357 - Precision: 0.0000e+00 - Recall: 0.0000e+00 - accuracy: 0.7223 - loss: 0.6455    ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.7831 - Precision: 0.3159 - Recall: 0.0433 - accuracy: 0.7353 - loss: 0.5960        ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.8160 - Precision: 0.5366 - Recall: 0.1344 - accuracy: 0.7613 - loss: 0.549 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.8386 - Precision: 0.6480 - Recall: 0.2108 - accuracy: 0.7827 - loss: 0.509 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.8550 - Precision: 0.7099 - Recall: 0.2718 - accuracy: 0.7989 - loss: 0.477 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.8680 - Precision: 0.7529 - Recall: 0.3227 - accuracy: 0.8126 - loss: 0.450 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - AUC: 0.8785 - Precision: 0.7846 - Recall: 0.3648 - accuracy: 0.8240 - loss: 0.427 ━━━━━━━━━━━━

In [26]:
len(train_features)

140743

In [25]:
# Example usage
malicious_flow = malicious_df.iloc[0]
malicious_flow_features, malicious_flow_meta = flow_to_np(malicious_flow)

benign_flow = benign_df.iloc[0]
benign_flow_features, benign_flow_meta = flow_to_np(benign_flow)

print(benign_flow_meta)
print(malicious_flow_meta)

# import from flow_analysis.py

# Example usage
prediction, prediction_prob = classify_single_flow(malicious_flow_features, dnn, scaler)
print(f"MALICIOUS - Prediction: {prediction}, Probability: {prediction_prob:.4f}")

prediction, prediction_prob = classify_single_flow(benign_flow_features, dnn, scaler)
print(f"BENIGN - Prediction: {prediction}, Probability: {prediction_prob:.4f}")

{'id': '192.168.4.118-125.6.164.51-4922-80-6', 'timestamp': np.float64(1276388681.408579), 'src_ip': '192.168.4.118', 'dst_ip': '125.6.164.51', 'src_port': np.int64(4922), 'dst_port': np.int64(80), 'protocol': np.int64(6), 'termination_reason': 'FIN', 'label': np.int64(0)}
{'id': '192.168.1.105-192.168.5.122-18539-110-6', 'timestamp': np.float64(1276351088.564463), 'src_ip': '192.168.1.105', 'dst_ip': '192.168.5.122', 'src_port': np.int64(18539), 'dst_port': np.int64(110), 'protocol': np.int64(6), 'termination_reason': 'FIN', 'label': np.int64(1)}
MALICIOUS - Prediction: 1, Probability: 0.9934
BENIGN - Prediction: 0, Probability: 0.0002
