In [1]:
import json  # For saving/loading threshold
import os
import warnings

import joblib  # For saving/loading scikit-learn objects
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [2]:
# Configuration: directory where the fitted scaler, the trained model and
# the anomaly threshold are written by later cells.
save_dir = "trained_models_and_params"
os.makedirs(name=save_dir, exist_ok=True)  # no error if it already exists

In [3]:
# NOTE(review): this silences every warning category from here on, not only
# cosmetic ones — consider narrowing the filter to the categories that are
# actually noisy.
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output

In [4]:
# Load the flow records from the CSV export; the path is relative to the
# notebook's working directory.
df = pd.read_csv("processed/Friday-16-02-2018_TrafficForML_CICFlowMeter.csv")

In [5]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   Dst Port           1048575 non-null  object
 1   Protocol           1048575 non-null  object
 2   Timestamp          1048575 non-null  object
 3   Flow Duration      1048575 non-null  object
 4   Tot Fwd Pkts       1048575 non-null  object
 5   Tot Bwd Pkts       1048575 non-null  object
 6   TotLen Fwd Pkts    1048575 non-null  object
 7   TotLen Bwd Pkts    1048575 non-null  object
 8   Fwd Pkt Len Max    1048575 non-null  object
 9   Fwd Pkt Len Min    1048575 non-null  object
 10  Fwd Pkt Len Mean   1048575 non-null  object
 11  Fwd Pkt Len Std    1048575 non-null  object
 12  Bwd Pkt Len Max    1048575 non-null  object
 13  Bwd Pkt Len Min    1048575 non-null  object
 14  Bwd Pkt Len Mean   1048575 non-null  object
 15  Bwd Pkt Len Std    1048575 non-null  object
 16  

In [6]:
# Class distribution of the raw labels.
df["Label"].value_counts()

Label
DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Label                            1
Name: count, dtype: int64

In [7]:
# Discard rows whose Label is the literal string "Label"
# (presumably a header line repeated inside the data file).
df = df.loc[df["Label"].ne("Label")]

In [8]:
# Class distribution again, after the row filter applied in the previous cell.
df["Label"].value_counts()

Label
DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Name: count, dtype: int64

In [9]:
# Remove the columns that are not used as model features.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['Dst Port', 'Protocol', 'Timestamp'], errors='ignore')

df_y = df['Label'].copy()        # original label strings, one per row
df_x = df.drop(columns='Label')  # every remaining column is a feature (drop returns a new frame)

# Binary target: 0 for 'Benign', 1 for any other label.
# Vectorised comparison instead of a per-row Python lambda; int64 matches the
# dtype the lambda-based version produced.
df_y_binary = (df_y != 'Benign').astype('int64')

In [10]:
# Coerce every feature column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
df_x = df_x.apply(pd.to_numeric, errors='coerce')

In [11]:
# Handle infinite / missing values.
# +/-inf are first mapped to NaN. Any row that then contains a NaN (from an
# infinity, or from the numeric coercion of an unparsable value) is removed,
# because StandardScaler passes NaNs through unchanged and they would reach
# the network as inputs.
# NOTE(review): confirm that dropping (rather than imputing) is wanted.
df_x = df_x.replace([np.inf, -np.inf], np.nan)

rows_complete = df_x.notna().all(axis=1)
n_dropped = int((~rows_complete).sum())
if n_dropped:
    # Filter the labels with the same mask so features and targets stay aligned.
    df_x = df_x.loc[rows_complete]
    df_y = df_y.loc[rows_complete]
    df_y_binary = df_y_binary.loc[rows_complete]
    print(f"Dropped {n_dropped} rows containing NaN/inf values")

In [12]:
# Per-column count of missing values (NaN) in the feature frame.
df_x.isnull().sum()

Flow Duration      0
Tot Fwd Pkts       0
Tot Bwd Pkts       0
TotLen Fwd Pkts    0
TotLen Bwd Pkts    0
                  ..
Active Min         0
Idle Mean          0
Idle Std           0
Idle Max           0
Idle Min           0
Length: 76, dtype: int64

In [13]:
# Scale the features
# NOTE(review): the scaler statistics come from every row of df_x (benign and
# attack alike) — confirm this is intended for a benign-only autoencoder.
scaler_path = os.path.join(save_dir, 'fitted_scaler.pkl')

scaler = StandardScaler().fit(df_x)

# Persist the fitted scaler so the same transformation can be reapplied later
joblib.dump(scaler, scaler_path)
print(f"StandardScaler fitted and saved to {scaler_path}")

# Standardise the complete feature frame
X_scaled = scaler.transform(df_x)

StandardScaler fitted and saved to trained_models_and_params\fitted_scaler.pkl


In [14]:
# === Step 1: Select the benign subset ===
# Boolean selector of benign rows (binary label 0)
benign_mask = df_y_binary.eq(0)
# Scaled feature rows of benign traffic only
X_normal = X_scaled[benign_mask]

In [15]:
# === Step 2: Build the Autoencoder ===
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation (and the shuffling done during training) is
# repeatable between runs.
SEED = 42
set_random_seed(SEED)

input_dim = X_normal.shape[1]  # number of feature columns
input_layer = Input(shape=(input_dim,))

# Encoder: 64 -> 32 -> 16 units, ReLU activations
encode_layer_1 = Dense(64, activation='relu')(input_layer)
encode_layer_2 = Dense(32, activation='relu')(encode_layer_1)
bottleneck = Dense(16, activation='relu')(encode_layer_2)

# Decoder mirrors the encoder: 32 -> 64 -> input_dim
decode_layer_1 = Dense(32, activation='relu')(bottleneck)
decode_layer_2 = Dense(64, activation='relu')(decode_layer_1)
# Linear output because the targets are standardised values, not bounded to a range
output_layer = Dense(input_dim, activation='linear')(decode_layer_2)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
# Mean squared error between input and reconstruction is the training loss
autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')

In [16]:
# === Step 3: Train on benign traffic only ===
# Input and target are the same array: the network learns to reproduce X_normal.
training_options = dict(
    epochs=50,
    batch_size=64,
    validation_split=0.1,  # fraction of X_normal held out to compute val_loss
    shuffle=True,
    verbose=1,
)
autoencoder.fit(x=X_normal, y=X_normal, **training_options)


Epoch 1/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 0.3918 - val_loss: 1.3372
Epoch 2/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 979us/step - loss: 0.2749 - val_loss: 1.1980
Epoch 3/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 948us/step - loss: 0.2587 - val_loss: 1.0182
Epoch 4/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 988us/step - loss: 0.2162 - val_loss: 1.0756
Epoch 5/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.1453 - val_loss: 0.6633
Epoch 6/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 995us/step - loss: 0.1596 - val_loss: 1.2759
Epoch 7/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 998us/step - loss: 0.1427 - val_loss: 1.0448
Epoch 8/50
[1m6283/6283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 998us/step - loss: 0.1977 - val_loss: 0.9952
Epoch 9/50


<keras.src.callbacks.history.History at 0x1f0588c4290>

In [17]:
# Persist the trained autoencoder (.h5 file) in the same directory as the scaler
model_path = os.path.join(save_dir, 'trained_autoencoder.h5')
autoencoder.save(model_path)
print(f"Autoencoder model trained and saved to {model_path}")



Autoencoder model trained and saved to trained_models_and_params\trained_autoencoder.h5


In [18]:
# === Step 4: Compute reconstruction error ===
# Per-row mean squared error between the scaled input and its reconstruction
reconstructed_all = autoencoder.predict(X_scaled)
reconstruction_error = ((X_scaled - reconstructed_all) ** 2).mean(axis=1)

# Threshold = 95th percentile of benign reconstruction error
benign_recon_error = reconstruction_error[benign_mask]
benign_error_summary = pd.Series(benign_recon_error).describe()
print(f"Benign Reconstruction Error Statistics:\n{benign_error_summary}")
threshold = np.percentile(benign_recon_error, 95)

# Binary predictions (True = anomaly): error strictly above the threshold
predictions = reconstruction_error > threshold

[1m32768/32768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 478us/step
Benign Reconstruction Error Statistics:
count    446772.000000
mean          0.108572
std          20.938884
min           0.000009
25%           0.000047
50%           0.000089
75%           0.000234
max        6069.007420
dtype: float64


In [19]:
# Persist the anomaly threshold as {"threshold": <value>} for later reuse
threshold_path = os.path.join(save_dir, 'threshold.json')
with open(threshold_path, 'w') as threshold_file:
    json.dump({"threshold": threshold}, threshold_file)
print(f"Anomaly threshold ({threshold:.6f}) saved to {threshold_path}")


Anomaly threshold (0.001812) saved to trained_models_and_params\threshold.json


In [21]:
# === Step 5: Evaluate performance ===

# The raw reconstruction error serves as the anomaly score for ROC AUC
auc = roc_auc_score(y_true=df_y_binary, y_score=reconstruction_error)
total = len(predictions)
num_anomalies = predictions.sum()  # rows flagged above the threshold

print(
    f"AUC Score: {auc:.4f}",
    f"Threshold (95th percentile): {threshold:.6f}",
    f"Detected Anomalies: {num_anomalies}/{total}",
    sep="\n",
)

AUC Score: 0.9809
Threshold (95th percentile): 0.001812
Detected Anomalies: 624141/1048574


In [22]:
# --- Evaluation ---
# 1 where the reconstruction error exceeds the threshold, otherwise 0
y_pred_anomaly = np.greater(reconstruction_error, threshold).astype(int)
n_samples = len(df_y_binary)
n_true_anomalies = df_y_binary.sum()

print("\n--- Autoencoder Anomaly Detection Evaluation ---")
print(f"Total Samples: {n_samples}")
print(f"True Anomalies: {n_true_anomalies} ({n_true_anomalies/n_samples*100:.2f}%)")
print(f"Predicted Anomalies: {y_pred_anomaly.sum()}")

print("\nClassification Report:")
# df_y_binary is row-aligned with X_scaled, from which the errors were computed
print(classification_report(df_y_binary, y_pred_anomaly, target_names=['Normal', 'Anomaly']))


--- Autoencoder Anomaly Detection Evaluation ---
Total Samples: 1048574
True Anomalies: 601802 (57.39%)
Predicted Anomalies: 624141

Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      0.95      0.97    446772
     Anomaly       0.96      1.00      0.98    601802

    accuracy                           0.98   1048574
   macro avg       0.98      0.97      0.98   1048574
weighted avg       0.98      0.98      0.98   1048574

