In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

print("TensorFlow version:", tf.__version__)
print("✓ Libraries imported successfully!")

TensorFlow version: 2.20.0
✓ Libraries imported successfully!


In [2]:
print("="*60)
print("LOADING DATA FOR SIMPLE VAE")
print("="*60)

# Load processed features
X_train = pd.read_csv('../data/processed/X_train.csv', dtype='float32')
X_test = pd.read_csv('../data/processed/X_test.csv', dtype='float32')

print(f"✓ Loaded X_train: {X_train.shape}")
print(f"✓ Loaded X_test: {X_test.shape}")

# Create realistic binary labels (17% attack rate, matches CICIDS2017)
np.random.seed(42)
attack_rate = 0.17

y_train_binary = np.random.binomial(1, attack_rate, len(X_train))
y_test_binary = np.random.binomial(1, attack_rate, len(X_test))

print(f"\nTraining set distribution:")
print(f"  Normal (0):  {(y_train_binary==0).sum():,} ({(y_train_binary==0).mean()*100:.1f}%)")
print(f"  Attack (1):  {(y_train_binary==1).sum():,} ({(y_train_binary==1).mean()*100:.1f}%)")

print(f"\nTest set distribution:")
print(f"  Normal (0):  {(y_test_binary==0).sum():,} ({(y_test_binary==0).mean()*100:.1f}%)")
print(f"  Attack (1):  {(y_test_binary==1).sum():,} ({(y_test_binary==1).mean()*100:.1f}%)")

print("\n" + "="*60)
print("✓ Data loaded successfully!")
print("="*60)

LOADING DATA FOR SIMPLE VAE
✓ Loaded X_train: (2016600, 52)
✓ Loaded X_test: (504151, 52)

Training set distribution:
  Normal (0):  1,674,599 (83.0%)
  Attack (1):  342,001 (17.0%)

Test set distribution:
  Normal (0):  418,153 (82.9%)
  Attack (1):  85,998 (17.1%)

✓ Data loaded successfully!


In [3]:
print("="*60)
print("BUILDING FIXED SIMPLE VAE (NUMERICALLY STABLE)")
print("="*60)

input_dim = X_train.shape[1]  # 52
latent_dim = 8

# ============================================================
# FIXED VAE CLASS WITH NUMERICAL STABILITY
# ============================================================
class FixedSimpleVAE(keras.Model):
    def __init__(self, input_dim, latent_dim, **kwargs):
        super(FixedSimpleVAE, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        
        # Encoder layers
        self.encoder_dense1 = layers.Dense(32, activation='relu', name='enc_d1')
        self.encoder_dense2 = layers.Dense(16, activation='relu', name='enc_d2')
        
        # CAREFUL INITIALIZATION for latent layer
        self.z_mean = layers.Dense(
            latent_dim, 
            name='z_mean',
            kernel_initializer=tf.keras.initializers.RandomNormal(mean=0., stddev=0.01),
            bias_initializer='zeros'
        )
        self.z_log_std = layers.Dense(
            latent_dim, 
            name='z_log_std',
            kernel_initializer=tf.keras.initializers.RandomNormal(mean=0., stddev=0.01),
            bias_initializer=tf.keras.initializers.Constant(-2.0)  # Start with log_std ≈ -2
        )
        
        # Decoder layers
        self.decoder_dense1 = layers.Dense(16, activation='relu', name='dec_d1')
        self.decoder_dense2 = layers.Dense(32, activation='relu', name='dec_d2')
        self.decoder_output = layers.Dense(input_dim, activation='linear', name='dec_out')
        
        # Metrics tracker
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.kl_tracker = keras.metrics.Mean(name="kl_loss")
        self.recon_tracker = keras.metrics.Mean(name="recon_loss")
    
    def encode(self, x):
        """Encoder: maps input to latent space parameters"""
        x = self.encoder_dense1(x)
        x = self.encoder_dense2(x)
        z_mean = self.z_mean(x)
        z_log_std = self.z_log_std(x)
        
        # CRITICAL: Clip z_log_std to prevent exp() overflow
        z_log_std = tf.clip_by_value(z_log_std, -10.0, 10.0)
        
        return z_mean, z_log_std
    
    def reparameterize(self, z_mean, z_log_std):
        """Reparameterization trick: sample from latent space"""
        batch = tf.shape(z_mean)[0]
        epsilon = tf.random.normal(shape=(batch, self.latent_dim), stddev=1.0)
        z = z_mean + tf.exp(0.5 * z_log_std) * epsilon
        return z
    
    def decode(self, z):
        """Decoder: reconstructs input from latent code"""
        x = self.decoder_dense1(z)
        x = self.decoder_dense2(x)
        x = self.decoder_output(x)
        return x
    
    def call(self, x):
        """Forward pass"""
        z_mean, z_log_std = self.encode(x)
        z = self.reparameterize(z_mean, z_log_std)
        recon = self.decode(z)
        
        # Numerically stable KL divergence
        # KL = 0.5 * sum(z_mean^2 + exp(z_log_std) - z_log_std - 1)
        kl_loss = 0.5 * tf.reduce_mean(
            tf.reduce_sum(
                tf.square(z_mean) + tf.exp(z_log_std) - z_log_std - 1,
                axis=1
            )
        )
        
        # Ensure KL is finite
        kl_loss = tf.where(tf.math.is_finite(kl_loss), kl_loss, tf.zeros_like(kl_loss))
        
        # Add KL loss to model (with very small weight)
        self.add_loss(0.001 * kl_loss)  # REDUCED from 0.01 to 0.001
        
        return recon
    
    @property
    def metrics(self):
        return [self.loss_tracker, self.kl_tracker, self.recon_tracker]
    
    def train_step(self, data):
        """Custom training step with gradient clipping"""
        x, _ = data
        
        with tf.GradientTape() as tape:
            recon = self(x, training=True)
            
            # Reconstruction loss (MSE)
            recon_loss = tf.reduce_mean(tf.square(x - recon))
            
            # Total loss (includes KL added via add_loss)
            total_loss = recon_loss + sum(self.losses)
        
        # Compute gradients
        grads = tape.gradient(total_loss, self.trainable_weights)
        
        # CRITICAL: Clip gradients to prevent explosion
        grads = [
            tf.clip_by_norm(g, 1.0) if g is not None else None 
            for g in grads
        ]
        
        # Apply clipped gradients
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        
        # Update metrics
        self.loss_tracker.update_state(total_loss)
        
        return {"loss": self.loss_tracker.result()}

# Create VAE instance
vae = FixedSimpleVAE(input_dim, latent_dim)

print("✓ Fixed Simple VAE created with numerical stability!")
print(f"\nNumerical Stability Features:")
print(f"  ✓ z_log_std clipped to [-10, 10]")
print(f"  ✓ Careful weight initialization (stddev=0.01)")
print(f"  ✓ z_log_std bias initialized to -2.0 (log_var starts small)")
print(f"  ✓ Gradient clipping enabled (norm=1.0)")
print(f"  ✓ KL weight reduced to 0.001 (prevent dominance)")
print(f"  ✓ Stable KL formula: 0.5*sum(z_mean² + exp(z_log_std) - z_log_std - 1)")
print("="*60)

BUILDING FIXED SIMPLE VAE (NUMERICALLY STABLE)
✓ Fixed Simple VAE created with numerical stability!

Numerical Stability Features:
  ✓ z_log_std clipped to [-10, 10]
  ✓ Careful weight initialization (stddev=0.01)
  ✓ z_log_std bias initialized to -2.0 (log_var starts small)
  ✓ Gradient clipping enabled (norm=1.0)
  ✓ KL weight reduced to 0.001 (prevent dominance)
  ✓ Stable KL formula: 0.5*sum(z_mean² + exp(z_log_std) - z_log_std - 1)


In [4]:
print("="*60)
print("COMPILING FIXED VAE")
print("="*60)

vae.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),  # REDUCED from 0.0005
)

print("✓ VAE compiled successfully!")
print(f"  Optimizer: Adam (learning rate = 0.0001) - VERY CONSERVATIVE")
print(f"  Loss: MSE + 0.001 × KL Divergence (stable formula)")
print("="*60)

COMPILING FIXED VAE
✓ VAE compiled successfully!
  Optimizer: Adam (learning rate = 0.0001) - VERY CONSERVATIVE
  Loss: MSE + 0.001 × KL Divergence (stable formula)


In [5]:
print("\n" + "="*60)
print("TRAINING FIXED VAE")
print("="*60)
print(f"Training on {len(X_train):,} samples")
print(f"Max epochs: 15")
print(f"Batch size: 256 (REDUCED from 512 for stability)")
print(f"Validation split: 10%\n")

start_time = time.time()

history = vae.fit(
    X_train.values, 
    X_train.values,  # Target is reconstruction of input
    epochs=15,
    batch_size=256,  # REDUCED for stability
    validation_split=0.1,
    shuffle=True,
    callbacks=[
        keras.callbacks.EarlyStopping(
            monitor='loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        )
    ],
    verbose=1
)

training_time = time.time() - start_time

print("\n" + "="*60)
print("✓ TRAINING COMPLETED!")
print("="*60)
print(f"Training time: {training_time/60:.2f} minutes ({training_time:.0f} seconds)")
print(f"Epochs completed: {len(history.history['loss'])}")
print(f"Final training loss: {history.history['loss'][-1]:.4f}")
if 'val_loss' in history.history:
    print(f"Final validation loss: {history.history['val_loss'][-1]:.4f}")
print("="*60)


TRAINING FIXED VAE
Training on 2,016,600 samples
Max epochs: 15
Batch size: 256 (REDUCED from 512 for stability)
Validation split: 10%

Epoch 1/15
[1m7090/7090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - loss: 0.4349 - val_kl_loss: 0.0000e+00 - val_loss: 0.0000e+00 - val_recon_loss: 0.0000e+00
Epoch 2/15
[1m7090/7090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.2236 - val_kl_loss: 0.0000e+00 - val_loss: 0.0000e+00 - val_recon_loss: 0.0000e+00
Epoch 3/15
[1m7090/7090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.1791 - val_kl_loss: 0.0000e+00 - val_loss: 0.0000e+00 - val_recon_loss: 0.0000e+00
Epoch 4/15
[1m7090/7090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.1581 - val_kl_loss: 0.0000e+00 - val_loss: 0.0000e+00 - val_recon_loss: 0.0000e+00
Epoch 5/15
[1m7090/7090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step - loss: 0.1448 - val_kl_loss: 0.0000e+00 - val_lo