<a href="https://colab.research.google.com/github/Vikk-17/sys_traffic_gen/blob/main/synthetic_traffic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras import backend as K

# Preprocessing function with get_dummies
scaler = None  # Global scaler to maintain consistency


def preprocess_csv_with_dummies(file_path):
    """Load a CSV file and turn it into an all-float32 feature table.

    Columns of dtype int64/float64 are min-max scaled with a freshly fitted
    ``MinMaxScaler`` that is stored in the module-level ``scaler`` so the
    scaling can be inverted later. Columns of dtype object/category are
    one-hot encoded with ``pd.get_dummies``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A tuple ``(frame, numerical_columns, categorical_columns)`` holding
        the processed float32 DataFrame and the names of the columns that
        were detected as numerical and as categorical in the raw file.
    """
    global scaler
    frame = pd.read_csv(file_path)

    # Column detection is done on the raw frame, before any transformation.
    numerical_columns = list(frame.select_dtypes(include=['int64', 'float64']).columns)
    categorical_columns = list(frame.select_dtypes(include=['object', 'category']).columns)

    # The global scaler is only replaced when there is something to scale.
    if len(numerical_columns) > 0:
        scaler = MinMaxScaler()
        frame[numerical_columns] = scaler.fit_transform(frame[numerical_columns])

    if len(categorical_columns) > 0:
        frame = pd.get_dummies(frame, columns=categorical_columns)

    # TensorFlow expects float32 input.
    tensor_ready = frame.astype(np.float32)
    return tensor_ready, numerical_columns, categorical_columns

# Load and preprocess data
file_path = '/content/synthetic_cicids_dataset_detailed.csv'  # Replace with your CSV file path
processed_data, numerical_columns, categorical_columns = preprocess_csv_with_dummies(file_path)

# Convert to NumPy array
data = processed_data.values

# 60/20/20 split: hold out 40% of the rows, then halve that hold-out into the
# validation and test sets. The second split has to operate on X_temp rather
# than on the full data, otherwise validation/test rows overlap the training
# set and the validation metrics are contaminated.
X_train, X_temp = train_test_split(data, test_size=0.4, random_state=42)
X_valid, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

# Define Variational Autoencoder components
class Sampling(tf.keras.layers.Layer):
    """Reparameterization layer: returns ``mean + exp(0.5 * log_var) * noise``.

    ``inputs`` is a pair ``(z_mean, z_log_var)``; both are indexed as rank-2
    tensors, and the noise is drawn with the same (batch, dim) shape as
    ``z_mean``.
    """

    def call(self, inputs):
        mean, log_variance = inputs
        dims = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(dims[0], dims[1]))
        std_dev = tf.exp(0.5 * log_variance)
        return mean + std_dev * noise

class VAE(Model):
    """Variational autoencoder assembled from a dense encoder and decoder.

    The forward pass registers its own objective (reconstruction error plus
    KL term) through ``add_loss`` and returns the reconstruction.
    """

    def __init__(self, original_dim, latent_dim):
        super().__init__()
        self.encoder = self.build_encoder(original_dim, latent_dim)
        self.decoder = self.build_decoder(original_dim, latent_dim)
        self.latent_dim = latent_dim

    def build_encoder(self, original_dim, latent_dim):
        """Build the encoder: input row -> ``[z_mean, z_log_var, z]``."""
        features = Input(shape=(original_dim,))
        hidden = Dense(128, activation="relu")(features)
        hidden = Dense(64, activation="relu")(hidden)
        mean_head = Dense(latent_dim)(hidden)
        log_var_head = Dense(latent_dim)(hidden)
        sampled = Sampling()([mean_head, log_var_head])
        return Model(features, [mean_head, log_var_head, sampled], name="encoder")

    def build_decoder(self, original_dim, latent_dim):
        """Build the decoder: latent vector -> sigmoid-activated reconstruction."""
        code = Input(shape=(latent_dim,))
        hidden = Dense(64, activation="relu")(code)
        hidden = Dense(128, activation="relu")(hidden)
        reconstruction = Dense(original_dim, activation="sigmoid")(hidden)
        return Model(code, reconstruction, name="decoder")

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # Per-sample MSE scaled by the feature count, i.e. the summed
        # squared error of each row.
        feature_count = tf.cast(tf.shape(inputs)[1], tf.float32)
        reconstruction_loss = tf.keras.losses.mse(inputs, reconstructed) * feature_count

        # Per-sample KL term computed from the encoder's mean and log-variance.
        kl_terms = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
        kl_loss = -0.5 * tf.reduce_sum(kl_terms, axis=-1)

        self.add_loss(tf.reduce_mean(reconstruction_loss + kl_loss))
        return reconstructed

# Define WGAN components
class Generator(tf.keras.Model):
    """WGAN generator: maps a latent noise batch to synthetic feature rows.

    Args:
        data_dim: Number of features in each generated row.
        output_activation: Activation of the final layer. Defaults to
            ``'sigmoid'`` because the training rows in this file are min-max
            scaled and one-hot encoded, so every real feature lies in
            [0, 1]. The previous ``'tanh'`` output spanned (-1, 1) and could
            emit negative features that map below the observed minimum once
            the scaling is inverted. Pass ``'tanh'`` to restore the old
            behaviour.
    """

    def __init__(self, data_dim, output_activation='sigmoid'):
        super().__init__()
        self.dense1 = Dense(128, activation='relu')
        self.dense2 = Dense(256, activation='relu')
        self.dense3 = Dense(data_dim, activation=output_activation)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.dense3(x)

class Discriminator(Model):
    """WGAN critic: three dense layers ending in a single linear score per row."""

    def __init__(self):
        super().__init__()
        self.dense1 = Dense(256, activation='relu')
        self.dense2 = Dense(128, activation='relu')
        self.dense3 = Dense(1)  # no activation: the score is unbounded

    def call(self, inputs):
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return hidden

# WGAN training loop
def train_wgan(generator, discriminator, data, latent_dim, batch_size=64, epochs=100,
               learning_rate=0.0001, clip_value=0.01):
    """Train a generator/critic pair with the Wasserstein GAN objective.

    Each batch performs one critic update followed by one generator update.
    The critic minimises ``mean(critic(fake)) - mean(critic(real))`` and the
    generator minimises ``-mean(critic(fake))``.

    Args:
        generator: Model mapping ``(batch, latent_dim)`` noise to fake rows.
        discriminator: Critic model returning one score per row.
        data: 2-D array of real training rows; sliced along axis 0 in order.
        latent_dim: Size of the noise vector fed to the generator.
        batch_size: Number of real rows per step (the last batch may be
            smaller).
        epochs: Number of passes over ``data``.
        learning_rate: Learning rate used for both Adam optimizers.
        clip_value: After every critic update, all critic weights are clipped
            to ``[-clip_value, clip_value]``. The Wasserstein loss is only
            meaningful for a Lipschitz-constrained critic; without a
            constraint the critic scores (and both losses) can grow without
            bound. Pass ``None`` to disable clipping.

    Raises:
        ValueError: If ``data`` contains no rows, because no training step
            could run and there would be no losses to report.
    """
    if data.shape[0] == 0:
        raise ValueError("train_wgan requires at least one training sample")

    # Optimizers
    gen_optimizer = Adam(learning_rate)
    disc_optimizer = Adam(learning_rate)

    # Training step
    for epoch in range(epochs):
        for start in range(0, data.shape[0], batch_size):
            real_data = data[start:start + batch_size]
            batch_size_real = real_data.shape[0]

            # Train Discriminator
            with tf.GradientTape() as disc_tape:
                z = tf.random.normal((batch_size_real, latent_dim))
                fake_data = generator(z, training=True)
                real_output = discriminator(real_data, training=True)
                fake_output = discriminator(fake_data, training=True)

                disc_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

            grads_disc = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
            disc_optimizer.apply_gradients(zip(grads_disc, discriminator.trainable_variables))

            # Enforce the Lipschitz constraint on the critic by weight clipping.
            if clip_value is not None:
                for weight in discriminator.trainable_variables:
                    weight.assign(tf.clip_by_value(weight, -clip_value, clip_value))

            # Train Generator
            with tf.GradientTape() as gen_tape:
                z = tf.random.normal((batch_size_real, latent_dim))
                fake_data = generator(z, training=True)
                fake_output = discriminator(fake_data, training=True)

                gen_loss = -tf.reduce_mean(fake_output)

            grads_gen = gen_tape.gradient(gen_loss, generator.trainable_variables)
            gen_optimizer.apply_gradients(zip(grads_gen, generator.trainable_variables))

        # Logging (losses of the last batch of the epoch)
        print(f"Epoch {epoch + 1}/{epochs}, Generator Loss: {gen_loss.numpy()}, Discriminator Loss: {disc_loss.numpy()}")

# Initialize generator, discriminator, and VAE
original_dim = data.shape[1]
latent_dim = 10
vae_latent_dim = 5

# The VAE is fitted as an autoencoder, so the inputs double as the targets.
# Its loss is registered inside VAE.call, hence no `loss=` argument here.
vae = VAE(original_dim=original_dim, latent_dim=vae_latent_dim)
vae.compile(
    optimizer=Adam(learning_rate=0.0001),
    metrics=[MeanSquaredError()],
)
vae.fit(
    X_train,
    X_train,
    validation_data=(X_valid, X_valid),
    epochs=50,
    batch_size=64,
)

generator = Generator(data_dim=original_dim)
discriminator = Discriminator()

# Train WGAN
train_wgan(generator, discriminator, data=X_train, latent_dim=latent_dim)

# Generate synthetic data
def generate_synthetic_data(generator, num_samples, latent_dim):
    """Draw ``num_samples`` noise vectors and return the generator output.

    Args:
        generator: Callable model taking a ``(num_samples, latent_dim)``
            noise tensor; it is invoked with ``training=False``.
        num_samples: Number of rows to generate.
        latent_dim: Size of each noise vector.

    Returns:
        The generator output converted with ``.numpy()``.
    """
    noise = tf.random.normal((num_samples, latent_dim))
    generated = generator(noise, training=False)
    return generated.numpy()

synthetic_samples = generate_synthetic_data(generator, 1000, latent_dim)

# Transform synthetic data back to match original input

def postprocess_synthetic_data(synthetic_data, original_df, numerical_columns, categorical_columns):
    """Convert generated feature rows back into the layout of the raw dataset.

    Steps: undo the min-max scaling of the numerical columns with the
    module-level ``scaler``, collapse every group of one-hot columns back
    into a single categorical column, derive ``Label`` from ``Attack Type``
    when that column exists, and coerce the port columns to integers.

    Args:
        synthetic_data: 2-D array with one column per column of
            ``original_df``.
        original_df: Processed DataFrame; only its column labels are used.
        numerical_columns: Names of the columns that were min-max scaled.
        categorical_columns: Names of the raw columns that were one-hot
            encoded as ``<name>_<category>``.

    Returns:
        DataFrame with de-normalized numerical columns and the one-hot
        groups replaced by their categorical columns.
    """
    global scaler
    df = pd.DataFrame(synthetic_data, columns=original_df.columns)

    # Denormalize numerical columns
    rescaled = bool(numerical_columns) and scaler is not None
    if rescaled:
        df[numerical_columns] = scaler.inverse_transform(df[numerical_columns])

    # Convert one-hot encoded columns back to original categories
    for cat_col in categorical_columns:
        prefix = cat_col + '_'
        dummy_cols = [col for col in df.columns if col.startswith(prefix)]
        if dummy_cols:
            # Strip exactly the '<column>_' prefix. Splitting on the first
            # underscore would truncate the category whenever the column
            # name itself contains one ('Attack_Type_DDoS' -> 'Type_DDoS').
            winners = df[dummy_cols].idxmax(axis=1)
            df[cat_col] = winners.apply(lambda name, skip=len(prefix): name[skip:])
            df = df.drop(columns=dummy_cols)

    # Mimic original labels based on given traffic types (Benign or Malicious)
    if 'Attack Type' in df.columns:
        df['Label'] = df['Attack Type'].apply(lambda x: 'Benign' if x.lower() in ['normal', 'benign'] else 'Malicious')

    # Port corrections. They are applied only to port columns that actually
    # exist, so datasets without them no longer raise KeyError. NaNs are
    # filled before the integer cast, which would otherwise fail on them.
    for port_col in ('Source Port', 'Destination Port'):
        if port_col not in df.columns:
            continue
        ports = df[port_col].fillna(0)
        if rescaled and port_col in numerical_columns:
            # Clip and round de-normalized ports to the valid range
            ports = ports.clip(0, 65535).round()
        df[port_col] = ports.astype(int)

    return df

synthetic_data_df = postprocess_synthetic_data(
    synthetic_samples,
    processed_data,
    numerical_columns,
    categorical_columns,
)

# Save to CSV
output_csv = 'synthetic_traffic_vae.csv'
synthetic_data_df.to_csv(output_csv, index=False)
print(f"Postprocessed synthetic data saved to '{output_csv}'.")


Epoch 1/50
10/10 ━━━━━━━━━━━━━━━━━━━━ 6s 73ms/step - loss: 749.9640 - mean_squared_error: 0.2490 - val_loss: 742.9573 - val_mean_squared_error: 0.2467
Epoch 2/50
10/10 ━━━━━━━━━━━━━━━━━━━━ 1s 44ms/step - loss: 740.7565 - mean_squared_error: 0.2459 - val_loss: 733.2805 - val_mean_squared_error: 0.2435
Epoch 3/50
10/10 ━━━━━━━━━━━━━━━━━━━━ 1s 46ms/step - loss: 730.5618 - mean_squared_error: 0.2425 - val_loss: 721.4537 - val_mean_squared_error: 0.2395
Epoch 4/50
10/10 ━━━━━━━━━━━━━━━━━━━━ 1s 38ms/step - loss: 716.8420 - mean_squared_error: 0.2380 - val_loss: 705.8759 - val_mean_squared_error: 0.2343
Epoch 5/50
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step - loss: 700.8282 - mean_squared_error: 0.2326 - val_loss: 687.7731 - val_mean_squared_error: 0.2283
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 