<a href="https://colab.research.google.com/github/Vikk-17/sys_traffic_gen/blob/main/synthetic_traffic_vae_wgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Preprocessing function with get_dummies
scaler = None

def preprocess_csv_with_dummies(file_path):
    global scaler
    df = pd.read_csv(file_path)

    # Detect numerical and categorical columns
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Normalize numerical columns
    if numerical_columns:
        scaler = MinMaxScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # One-hot encode categorical columns using pd.get_dummies
    if categorical_columns:
        df = pd.get_dummies(df, columns=categorical_columns)

    # Convert the DataFrame to float32 for compatibility with TensorFlow
    return df.astype(np.float32), numerical_columns, categorical_columns

# Load and preprocess data
file_path = '/content/synthetic_cicids_dataset_detailed.csv'

processed_data, numerical_columns, categorical_columns = preprocess_csv_with_dummies(file_path)

# Convert to NumPy array
data = processed_data.values

X_train, X_temp = train_test_split(data, test_size=0.4, random_state=42)
X_valid, X_test = train_test_split(data, test_size=0.5, random_state=42)

# Define Variational Autoencoder components
class Sampling(tf.keras.layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

class VAE(tf.keras.Model):
    def __init__(self, original_dim, latent_dim):
        super(VAE, self).__init__()
        self.encoder = self.build_encoder(original_dim, latent_dim)
        self.decoder = self.build_decoder(original_dim, latent_dim)
        self.latent_dim = latent_dim

    def build_encoder(self, original_dim, latent_dim):
        inputs = tf.keras.layers.Input(shape=(original_dim,))
        x = tf.keras.layers.Dense(128, activation="relu")(inputs)
        x = tf.keras.layers.Dense(64, activation="relu")(x)
        z_mean = tf.keras.layers.Dense(latent_dim)(x)
        z_log_var = tf.keras.layers.Dense(latent_dim)(x)
        z = Sampling()([z_mean, z_log_var])
        return tf.keras.Model(inputs, [z_mean, z_log_var, z], name="encoder")

    def build_decoder(self, original_dim, latent_dim):
        latent_inputs = tf.keras.layers.Input(shape=(latent_dim,))
        x = tf.keras.layers.Dense(64, activation="relu")(latent_inputs)
        x = tf.keras.layers.Dense(128, activation="relu")(x)
        outputs = tf.keras.layers.Dense(original_dim, activation="sigmoid")(x)
        return tf.keras.Model(latent_inputs, outputs, name="decoder")

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        reconstruction_loss = tf.keras.losses.mse(inputs, reconstructed)
        reconstruction_loss *= tf.cast(tf.shape(inputs)[1], tf.float32)
        kl_loss = -0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        self.add_loss(tf.reduce_mean(reconstruction_loss + kl_loss))
        return reconstructed


# Define WGAN components
class Generator(tf.keras.Model):
    def __init__(self, data_dim):
        super(Generator, self).__init__()
        self.dense1 = tf.keras.layers.Dense(128, activation='relu')
        self.dense2 = tf.keras.layers.Dense(256, activation='relu')
        self.dense3 = tf.keras.layers.Dense(data_dim, activation='tanh')


    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.dense3(x)


class Discriminator(tf.keras.Model):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.dense1 = tf.keras.layers.Dense(256, activation='relu')
        self.dense2 = tf.keras.layers.Dense(128, activation='relu')
        self.dense3 = tf.keras.layers.Dense(1)


    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.dense3(x)


# WGAN training loop
def train_wgan(generator, discriminator, data, latent_dim, batch_size=64, epochs=100, learning_rate=0.0001):
    # Optimizers
    gen_optimizer = tf.keras.optimizers.Adam(learning_rate)
    disc_optimizer = tf.keras.optimizers.Adam(learning_rate)

    # Training step
    for epoch in range(epochs):
        for i in range(0, data.shape[0], batch_size):
            real_data = data[i:i + batch_size]
            batch_size_real = real_data.shape[0]

            # Train Discriminator
            with tf.GradientTape() as disc_tape:
                z = tf.random.normal((batch_size_real, latent_dim))
                fake_data = generator(z, training=True)
                real_output = discriminator(real_data, training=True)
                fake_output = discriminator(fake_data, training=True)

                disc_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

            grads_disc = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
            disc_optimizer.apply_gradients(zip(grads_disc, discriminator.trainable_variables))

            # Train Generator
            with tf.GradientTape() as gen_tape:
                z = tf.random.normal((batch_size_real, latent_dim))
                fake_data = generator(z, training=True)
                fake_output = discriminator(fake_data, training=True)

                gen_loss = -tf.reduce_mean(fake_output)

            grads_gen = gen_tape.gradient(gen_loss, generator.trainable_variables)
            gen_optimizer.apply_gradients(zip(grads_gen, generator.trainable_variables))

        # Logging
        print(f"Epoch {epoch + 1}/{epochs}, Generator Loss: {gen_loss.numpy()}, Discriminator Loss: {disc_loss.numpy()}")


# Initialize generator, discriminator, and VAE
original_dim = data.shape[1]
latent_dim = 10
vae_latent_dim = 5

vae = VAE(original_dim, vae_latent_dim)
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=[tf.keras.losses.MeanSquaredError()])
vae.fit(X_train, X_train, validation_data=(X_valid, X_valid), epochs=50, batch_size=64)

generator = Generator(original_dim)
discriminator = Discriminator()

# Train WGAN
train_wgan(generator, discriminator, X_train, latent_dim)


# Generate synthetic data
def generate_synthetic_data(generator, num_samples, latent_dim):
    z = tf.random.normal((num_samples, latent_dim))
    synthetic_data = generator(z, training=False).numpy()
    return synthetic_data

synthetic_samples = generate_synthetic_data(generator, num_samples=5000, latent_dim=latent_dim)


# Transform synthetic data back to match original input
def postprocess_synthetic_data(synthetic_data, original_df, numerical_columns, categorical_columns):
    global scaler
    df = pd.DataFrame(synthetic_data, columns=original_df.columns)

    # columns_to_convert = ["Packet Count", "Byte Count", "Flow Duration (ms)", "Idle Time (ms)", "Active Time (ms)"]

    # for column in columns_to_convert:
    #     if column in numerical_columns:
    #         df[column] = df[column].round().astype(int)

    # Denormalize numerical columns
    if numerical_columns and scaler:
        df[numerical_columns] = scaler.inverse_transform(df[numerical_columns])

        # Clip and round ports to valid range
        if 'Source Port' in numerical_columns:
            df['Source Port'] = df['Source Port'].clip(0, 65535).round().astype(int)
        if 'Destination Port' in numerical_columns:
            df['Destination Port'] = df['Destination Port'].clip(0, 65535).round().astype(int)


        if "Packet Count" in numerical_columns:
            df['Packet Count'] = df['Packet Count'].round().astype(int)
        if "Byte Count" in numerical_columns:
            df['Byte Count'] = df['Byte Count'].round().astype(int)
        if "Flow Duration (ms)" in numerical_columns:
            df['Flow Duration (ms)'] = df['Flow Duration (ms)'].round().astype(int)
        if "Idle Time (ms)" in numerical_columns:
            df['Idle Time (ms)'] = df['Idle Time (ms)'].round().astype(int)

        if "Active Time (ms)" in numerical_columns:
            df['Active Time (ms)'] = df['Active Time (ms)'].round().astype(int)


    # Convert one-hot encoded columns back to original categories
    for cat_col in categorical_columns:
        cat_prefix = [col for col in df.columns if col.startswith(cat_col + '_')]
        if cat_prefix:
            df[cat_col] = df[cat_prefix].idxmax(axis=1).apply(lambda x: x.split('_', 1)[-1])
            df = df.drop(columns=cat_prefix)


    if 'Attack Type' in df.columns:
        df['Label'] = df['Attack Type'].apply(lambda x: 'Benign' if x.lower() in ['normal', 'benign'] else 'Malicious')


    df['Source Port'] = df['Source Port'].fillna(0).astype(int)
    df['Destination Port'] = df['Destination Port'].fillna(0).astype(int)

    return df

synthetic_data_df = postprocess_synthetic_data(synthetic_samples, processed_data, numerical_columns, categorical_columns)

# Save to CSV
synthetic_data_df.to_csv('synthetic_traffic_vae_wgan.csv', index=False)
print("Postprocessed synthetic data saved to 'synthetic_traffic_vae_wgan.csv'.")


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - loss: 749.4598 - mean_squared_error: 0.2488 - val_loss: 742.3734 - val_mean_squared_error: 0.2465
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 739.8483 - mean_squared_error: 0.2456 - val_loss: 731.0573 - val_mean_squared_error: 0.2427
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 728.8547 - mean_squared_error: 0.2419 - val_loss: 718.7036 - val_mean_squared_error: 0.2386
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 715.2525 - mean_squared_error: 0.2374 - val_loss: 703.3266 - val_mean_squared_error: 0.2335
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 699.1729 - mean_squared_error: 0.2321 - val_loss: 683.3559 - val_mean_squared_error: 0.2269
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 