<a href="https://colab.research.google.com/github/Vikk-17/sys_traffic_gen/blob/main/optimizedWGan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TODOs:
-   Try different hyperparameters for optimization
-   Compare optimizers (Adam, RMSProp) and learning rates
-   Non-linear activations
-   Use fewer features
-   Test the model with X_test
-   Cross-validation
-   Save the model


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import keras

In [None]:
# Load multiple datasets
def load_datasets(file_paths):
    """
    Read several CSV files and stack them into a single data frame.

    Each file is loaded with ``pd.read_csv``, rows containing any missing
    value are dropped, and surrounding whitespace is stripped from the
    column names before the frames are concatenated with a fresh index.

    :params: file_paths: iterable of CSV paths
    :return: combined data frame
    """

    def _read_clean(path):
        # Drop incomplete rows first, then normalise the header names.
        frame = pd.read_csv(path).dropna()
        frame.columns = frame.columns.str.strip()
        return frame

    frames = [_read_clean(path) for path in file_paths]
    return pd.concat(frames, ignore_index=True)


def process_data(dataFrame):
    """
    Split a labelled data frame into scaled train/test features and
    integer-encoded labels.

    The split is performed BEFORE imputation and scaling so that the fill
    values and the min/max used for normalisation are learned from the
    training rows only; fitting them on the full data set would leak
    test-set statistics into training.

    :params: dataFrame: pandas data frame containing a 'Label' column
    :return: X_train, X_test (numpy arrays of scaled features),
             y_train, y_test (numpy arrays of integer labels),
             scaler (MinMaxScaler fitted on the training features),
             label_encoder (LabelEncoder fitted on all labels)
    """
    labels = dataFrame['Label'].copy()
    features = dataFrame.drop(columns=['Label'])

    # Encode the string labels to integers. The encoder sees every label so
    # that classes appearing only in the test split are still known to it.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    # Replace any infinity values with NaN so they can be imputed below
    features = features.replace(to_replace=[-np.inf, np.inf], value=np.nan)

    # Split first; same test_size / random_state as before, so the row
    # assignment to train and test is unchanged.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Impute NaN values with the per-column mean of the TRAINING rows only
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Normalize features: x_scaled = (x - x_min) / (x_max - x_min), with
    # x_min / x_max taken from the training rows. Training features land in
    # [0, 1]; test features may fall slightly outside that range when a test
    # value exceeds the training extremes.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler, label_encoder


In [None]:
# VAE Encoder
class Encoder(tf.keras.Model):
    """
    VAE encoder: two ReLU hidden layers (128 then 64 units) followed by two
    parallel linear heads, each of width ``latent_dim``.
    """

    def __init__(self, latent_dim):
        super().__init__()
        dense = tf.keras.layers.Dense

        # Non-linear (ReLU) hidden layers
        self.dense1 = dense(128, activation='relu')
        self.dense2 = dense(64, activation='relu')

        # Linear heads (no activation) producing the latent parameters
        self.latent_mean = dense(latent_dim)
        self.latent_log_var = dense(latent_dim)

    def call(self, inputs):
        """Return the pair (latent mean, latent log-variance) for ``inputs``."""
        hidden = self.dense2(self.dense1(inputs))
        return self.latent_mean(hidden), self.latent_log_var(hidden)


# Decoder
class Decoder(tf.keras.Model):
    """
    Decoder: two ReLU hidden layers (64 then 128 units) followed by a
    sigmoid output layer of width ``feature_dim``.
    """

    def __init__(self, feature_dim):
        super().__init__()
        dense = tf.keras.layers.Dense

        self.dense1 = dense(64, activation='relu')
        self.dense2 = dense(128, activation='relu')
        # Sigmoid keeps every output in (0, 1)
        self.output_layer = dense(feature_dim, activation='sigmoid')

    def call(self, inputs):
        """Map latent vectors to reconstructed feature vectors."""
        hidden = self.dense2(self.dense1(inputs))
        return self.output_layer(hidden)


# Variational Autoencoder
class VariationalAutoencoder(tf.keras.Model):
    """Encoder/decoder pair joined by the reparameterisation trick."""

    def __init__(self, latent_dim, feature_dim):
        super().__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(feature_dim)

    def call(self, inputs):
        """
        Encode ``inputs``, sample a latent vector and decode it.

        z = mu + sigma * epsilon
        where mu is the latent mean, sigma = exp(0.5 * log_var) is the
        standard deviation, and epsilon is drawn from a standard normal
        distribution with the same shape as mu.

        :return: (reconstructed, mean, log_var)
        """
        mu, log_var = self.encoder(inputs)
        epsilon = tf.random.normal(shape=tf.shape(mu))
        sigma = tf.exp(log_var * 0.5)
        latent = mu + sigma * epsilon
        return self.decoder(latent), mu, log_var


def pretrain_vae(vae, data, epochs):
    """
    Train ``vae`` on ``data`` with Adam for ``epochs`` passes, printing the
    mean loss and mean reconstruction accuracy of every epoch.

    The loss is binary cross-entropy between input and reconstruction plus
    the KL term computed from the encoder's mean and log-variance.
    "Reconstruction accuracy" is the percentage of feature values whose
    reconstruction is within 0.1 of the input.

    :params: vae: model whose call returns (reconstructed, mean, log_var)
             data: array of feature rows, batched in chunks of 64
             epochs: number of passes over ``data``
    :raises ValueError: if ``data`` contains no rows
    """
    optimizer = tf.keras.optimizers.Adam()

    @tf.function
    def train_step(inputs):
        # Work in float32 throughout so inputs and reconstructions match.
        inputs = tf.cast(inputs, tf.float32)
        with tf.GradientTape() as tape:
            reconstructed, mean, log_var = vae(inputs)
            reconstruction_loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(inputs, reconstructed))
            kl_loss = -0.5 * tf.reduce_mean(1 + log_var - tf.square(mean) - tf.exp(log_var))
            loss = reconstruction_loss + kl_loss
        gradients = tape.gradient(loss, vae.trainable_variables)
        optimizer.apply_gradients(zip(gradients, vae.trainable_variables))

        # reconstruction accuracy
        recon_accuracy = tf.reduce_mean(tf.cast(tf.abs(inputs - reconstructed) < 0.1, tf.float32)) * 100
        return loss, recon_accuracy

    # The batched dataset does not change between epochs, so build it once.
    dataset = tf.data.Dataset.from_tensor_slices(data).batch(64)
    num_batches = len(dataset)
    if num_batches == 0:
        raise ValueError("pretrain_vae requires at least one row of data")

    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_accuracy = 0.0

        for batch in dataset:
            loss, accuracy = train_step(batch)
            # Accumulate the running means; the loss accumulator must take
            # the loss (it previously received the accuracy by mistake).
            epoch_loss += loss / num_batches
            epoch_accuracy += accuracy / num_batches

        # Report the epoch mean rather than the last batch's loss.
        print(f"Epoch {epoch + 1}, VAE Loss: {epoch_loss.numpy()}, Reconstructed Accuracy: {epoch_accuracy.numpy()}%")


def test_vae(vae, X_test, scaler):
    """
    Print the VAE's reconstruction loss and reconstruction accuracy on
    ``X_test``.

    Accuracy is the percentage of feature values reconstructed to within 0.1
    of the input, the same tolerance the training loop reports.

    :params: vae: model whose call returns (reconstructed, mean, log_var)
             X_test: feature rows, passed through ``scaler.transform``
             scaler: fitted scaler exposing ``transform``
    """
    # NOTE(review): X_test is scaled here. If the caller passes features that
    # were already scaled (as process_data returns them), this scales twice
    # - confirm what callers are expected to pass.
    X_test_normalized = tf.cast(scaler.transform(X_test), tf.float32)

    reconstructed, _, _ = vae(X_test_normalized)

    # compute reconstruction loss
    reconstruction_loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(X_test_normalized, reconstructed))

    # tf.cast needs an explicit dtype; threshold and scale as in training.
    within_tolerance = tf.abs(X_test_normalized - reconstructed) < 0.1
    recon_accuracy = tf.reduce_mean(tf.cast(within_tolerance, tf.float32)) * 100

    print(f"VAE Reconstruction Loss: {reconstruction_loss.numpy()}")
    print(f"VAE Reconstruction Accuracy: {recon_accuracy.numpy()}%")

In [None]:
# Generator
class WGANGenerator(tf.keras.Model):
    """
    WGAN generator that wraps a ``Decoder`` of width ``feature_dim``.

    ``latent_dim`` is accepted but not used by the constructor. The wrapped
    decoder is a newly created ``Decoder`` instance, so it starts from fresh
    weights rather than those of any previously trained decoder.
    """

    def __init__(self, latent_dim, feature_dim):
        super().__init__()
        self.vae_decoder = Decoder(feature_dim)

    def call(self, z):
        """Decode the latent batch ``z`` into synthetic feature rows."""
        synthetic = self.vae_decoder(z)
        return synthetic

# Discriminator
class Discriminator(tf.keras.Model):
    """
    Scoring network: two ReLU hidden layers (256 then 128 units) followed by
    a single linear output unit, so the score is unbounded.
    """

    def __init__(self):
        super().__init__()
        dense = tf.keras.layers.Dense

        self.dense1 = dense(256, activation='relu')
        self.dense2 = dense(128, activation='relu')
        # No activation: one raw score per input row
        self.output_layer = dense(1)

    def call(self, inputs):
        """Return one raw score per row of ``inputs``."""
        hidden = self.dense2(self.dense1(inputs))
        return self.output_layer(hidden)


def train_wgan(generator, discriminator, real_data, epochs, batch_size, latent_dim, clip_value=0.01):
    """
    Train a weight-clipped WGAN and print per-epoch mean metrics.

    Each step updates the discriminator once (followed by clipping its
    weights to [-clip_value, clip_value]) and then the generator once, both
    with RMSprop at learning rate 5e-5.

    :params: generator: model mapping (batch, latent_dim) noise to samples
             discriminator: model mapping samples to one score per row
             real_data: array of real feature rows, sliced into batches
             epochs: number of passes over ``real_data``
             batch_size: rows per real batch and noise vectors per step
             latent_dim: width of the generator's noise input
             clip_value: bound applied to every discriminator weight
    :raises ValueError: if ``real_data`` contains no rows
    """
    gen_optimizer = tf.keras.optimizers.RMSprop(learning_rate=5e-5)
    disc_optimizer = tf.keras.optimizers.RMSprop(learning_rate=5e-5)

    @tf.function
    def train_step(real_batch):
        noise = tf.random.normal([batch_size, latent_dim])

        # Train discriminator
        with tf.GradientTape() as disc_tape:
            fake_data = generator(noise)
            real_output = discriminator(real_batch)
            fake_output = discriminator(fake_data)
            disc_loss = -(tf.reduce_mean(real_output) - tf.reduce_mean(fake_output))

        disc_grads = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
        disc_optimizer.apply_gradients(zip(disc_grads, discriminator.trainable_variables))

        # Clip discriminator weights
        for var in discriminator.trainable_variables:
            var.assign(tf.clip_by_value(var, -clip_value, clip_value))

        # Train generator
        with tf.GradientTape() as gen_tape:
            fake_data = generator(noise)
            fake_output = discriminator(fake_data)
            gen_loss = -tf.reduce_mean(fake_output)

        gen_grads = gen_tape.gradient(gen_loss, generator.trainable_variables)
        gen_optimizer.apply_gradients(zip(gen_grads, generator.trainable_variables))

        # Generator "accuracy": percentage of fake rows whose score is
        # positive (sigmoid(score) > 0.5)
        fake_predictions = tf.sigmoid(fake_output) > 0.5
        gen_accuracy = tf.reduce_mean(tf.cast(fake_predictions, tf.float32)) * 100

        return gen_loss, disc_loss, gen_accuracy

    num_samples = len(real_data)
    if num_samples == 0:
        raise ValueError("train_wgan requires at least one row of real data")

    # Ceil division: the loop below also processes a final partial batch, so
    # floor division would skew the means (and be zero for tiny data sets).
    num_batches = -(-num_samples // batch_size)

    for epoch in range(epochs):
        epoch_gen_loss = 0.0
        epoch_disc_loss = 0.0
        epoch_gen_accuracy = 0.0

        for i in range(0, num_samples, batch_size):
            real_batch = real_data[i:i+batch_size]
            gen_loss, disc_loss, gen_accuracy = train_step(real_batch)
            epoch_gen_loss += gen_loss / num_batches
            epoch_disc_loss += disc_loss / num_batches
            epoch_gen_accuracy += gen_accuracy / num_batches

        # Report epoch means for all three metrics (the losses used to be
        # the last batch's values even though the means were accumulated).
        print(f"Epoch {epoch+1}, Generator Loss: {epoch_gen_loss.numpy()}, Discriminator Loss: {epoch_disc_loss.numpy()}, Generator Accuracy: {epoch_gen_accuracy.numpy()}%")


def test_wgan(generator, discriminator, X_test, latent_dim, scaler, batch_size=64):
    """
    Print how often the discriminator scores real rows above zero and
    generated rows below zero.

    Evaluation runs in chunks of ``batch_size`` rows so the whole test set
    is never pushed through the models in a single call; one generated row
    is scored for every real row.

    :params: generator: model mapping (batch, latent_dim) noise to samples
             discriminator: model mapping samples to one score per row
             X_test: feature rows, passed through ``scaler.transform``
             latent_dim: width of the generator's noise input
             scaler: fitted scaler exposing ``transform``
             batch_size: rows evaluated per chunk
    """
    # NOTE(review): X_test is scaled here. If the caller passes features that
    # were already scaled (as process_data returns them), this scales twice
    # - confirm what callers are expected to pass.
    real_data = scaler.transform(X_test)
    num_samples = len(real_data)

    # Count correct decisions per chunk, then normalise once at the end.
    real_correct = tf.constant(0.0)
    fake_correct = tf.constant(0.0)
    for start in range(0, num_samples, batch_size):
        real_batch = real_data[start:start + batch_size]

        # Generate as many fake rows as there are real rows in this chunk
        noise = tf.random.normal([len(real_batch), latent_dim])
        fake_batch = generator(noise)

        real_output = discriminator(real_batch)
        fake_output = discriminator(fake_batch)

        real_correct += tf.reduce_sum(tf.cast(real_output > 0, tf.float32))
        fake_correct += tf.reduce_sum(tf.cast(fake_output < 0, tf.float32))

    # Calculate discriminator accuracy as a percentage of all rows
    real_accuracy = real_correct / num_samples * 100
    fake_accuracy = fake_correct / num_samples * 100

    print(f"Discriminator Accuracy on Real Data: {real_accuracy.numpy()}%")
    print(f"Discriminator Accuracy on Fake Data: {fake_accuracy.numpy()}%")

In [None]:
# Combined VAE + WGAN Model
class VAEWGAN(tf.keras.Model):
    """
    Container model bundling a VAE, a WGAN generator and a discriminator.

    Calling the model reconstructs the inputs with the VAE and, in the same
    pass, draws one synthetic row per input row from the generator.

    :params: vae: model whose call returns (reconstructed, mean, log_var)
             generator: model mapping latent noise to synthetic samples
             discriminator: stored on the model; not used by ``call``
             latent_dim: optional width of the generator's noise input.
                 When omitted, the noise takes the shape of the VAE's latent
                 mean, i.e. the generator is assumed to share the VAE's
                 latent width.
    """

    def __init__(self, vae, generator, discriminator, latent_dim=None):
        super().__init__()
        self.vae = vae
        self.generator = generator
        self.discriminator = discriminator
        self.latent_dim = latent_dim

    def call(self, inputs):
        # VAE part
        reconstructed, mean, log_var = self.vae(inputs)

        # WGAN Generator part.
        # The noise shape is derived from tensors at run time instead of
        # `generator.input_shape` (not defined for a subclassed model with
        # no declared inputs) and `inputs.shape[0]` (may be None in graph mode).
        if self.latent_dim is None:
            noise_shape = tf.shape(mean)
        else:
            noise_shape = [tf.shape(mean)[0], self.latent_dim]
        noise = tf.random.normal(noise_shape)
        synthetic_data = self.generator(noise)

        return {
            'vae_reconstruction': reconstructed,
            'vae_mean': mean,
            'vae_log_var': log_var,
            'synthetic_data': synthetic_data
        }


In [None]:
def generate_synthetic_traffic(combine_df, generator, scaler, label_encoder, latent_dim, num_samples=1000,
                               output_path='synthetic_traffic.csv'):
    """
    Generate synthetic traffic rows, write them to a CSV file and return them.

    Noise is fed through ``generator``, mapped back to the original feature
    scale with ``scaler.inverse_transform`` and given the feature column
    names of ``combine_df``.

    :params: combine_df: data frame providing the column names; must contain
                 a 'Label' column and a 'Destination Port' column
             generator: model mapping (num_samples, latent_dim) noise to rows
             scaler: fitted scaler exposing ``inverse_transform``
             label_encoder: fitted encoder whose ``classes_`` are the labels
             latent_dim: width of the generator's noise input
             num_samples: number of rows to generate
             output_path: where the CSV is written
    :return: data frame of synthetic rows including the 'Label' column
    """
    noise = tf.random.normal([num_samples, latent_dim])
    synthetic_data = scaler.inverse_transform(generator(noise).numpy())

    # Reuse the original feature column names (everything except 'Label')
    original_columns = combine_df.drop(columns=["Label"]).columns
    synthetic_df = pd.DataFrame(synthetic_data, columns=original_columns)

    # Ports are integers in the valid range 0..65535
    synthetic_df['Destination Port'] = synthetic_df['Destination Port'].astype(int).clip(0, 65535)

    # Labels are drawn uniformly at random from the known classes. They are
    # NOT derived from the generated features or from the noise, so a row's
    # label says nothing about the row's content.
    synthetic_df['Label'] = np.random.choice(label_encoder.classes_, size=num_samples)

    synthetic_df.to_csv(output_path, index=False)
    print(f"Synthetic traffic saved to '{output_path}'")
    return synthetic_df

In [None]:
def main():
    """
    Run the pipeline end to end: load the CSV files, preprocess them,
    pre-train the VAE, train the WGAN, and save the combined model.
    """
    data_dir = "/content"
    csv_names = (
        "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
        "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
        "Friday-WorkingHours-Morning.pcap_ISCX.csv",
        "Monday-WorkingHours.pcap_ISCX.csv",
        "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
        "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
        "Tuesday-WorkingHours.pcap_ISCX.csv",
        "Wednesday-workingHours.pcap_ISCX.csv",
    )
    file_paths = [f"{data_dir}/{name}" for name in csv_names]

    data = load_datasets(file_paths)

    # Features come back already scaled; labels, scaler and encoder are kept
    # for the optional steps listed at the end of this function.
    X_train, X_test, y_train, y_test, scaler, label_encoder = process_data(data)

    latent_dim = 10
    feature_dim = X_train.shape[1]  # 78

    # Stage 1: pre-train the VAE
    vae = VariationalAutoencoder(latent_dim, feature_dim)
    pretrain_vae(vae, X_train, epochs=20)

    # Stage 2: train the WGAN
    # NOTE(review): WGANGenerator builds its own Decoder, so the decoder
    # pre-trained above is not the one being trained here - confirm intent.
    generator = WGANGenerator(latent_dim, feature_dim)
    discriminator = Discriminator()
    train_wgan(generator, discriminator, X_train, epochs=50, batch_size=64, latent_dim=latent_dim)

    # Stage 3: bundle the three models and save them
    combined_model = VAEWGAN(vae, generator, discriminator)
    combined_model.save('vae_wgan_model.keras')
    print("Combined model saved as 'vae_wgan_model.keras'.")

    # Optional follow-up steps (currently disabled):
    #   test_vae(vae, X_test, scaler)
    #   test_wgan(generator, discriminator, X_test, latent_dim, scaler)
    #   generate_synthetic_traffic(data, generator, scaler, label_encoder, latent_dim, num_samples=1000)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Epoch 1, VAE Loss: 0.154277965426445, Reconstructed Accuracy: 89.58180236816406%
Epoch 2, VAE Loss: 0.15425603091716766, Reconstructed Accuracy: 89.75859069824219%
Epoch 3, VAE Loss: 0.15423071384429932, Reconstructed Accuracy: 89.78211975097656%
Epoch 4, VAE Loss: 0.15419530868530273, Reconstructed Accuracy: 89.79608154296875%
Epoch 5, VAE Loss: 0.1541604846715927, Reconstructed Accuracy: 89.803955078125%
Epoch 6, VAE Loss: 0.15413358807563782, Reconstructed Accuracy: 89.81421661376953%
Epoch 7, VAE Loss: 0.15411099791526794, Reconstructed Accuracy: 89.81996154785156%
Epoch 8, VAE Loss: 0.15410666167736053, Reconstructed Accuracy: 89.82266998291016%
Epoch 9, VAE Loss: 0.15410347282886505, Reconstructed Accuracy: 89.83076477050781%
Epoch 10, VAE Loss: 0.15409192442893982, Reconstructed Accuracy: 89.83351135253906%
Epoch 11, VAE Loss: 0.15409578382968903, Reconstructed Accuracy: 89.83770751953125%
Epoch 12, VAE Loss: 0.15409541130065918, Reconstructed Accuracy: 89.84056854248047%
Epoch 