<a href="https://colab.research.google.com/github/Vikk-17/sys_traffic_gen/blob/main/integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import datetime
from datetime import timedelta

# Module-level preprocessing state shared by training and inference code.
# preprocess_csv_with_dummies(is_training=True) assigns/fills both objects.
scaler = None  # MinMaxScaler fitted on the numerical columns; None until fitted
label_encoders = {}  # column name -> LabelEncoder fitted on that column

def preprocess_csv_with_dummies(file_path, is_training=True):
    """Load a CSV and convert every detected column to a float32 feature.

    Despite the name, no dummy/one-hot columns are produced: object/category
    columns are integer-encoded with one ``LabelEncoder`` per column and
    int64/float64 columns are scaled together with one ``MinMaxScaler``. The
    fitted objects are kept in the module-level ``label_encoders`` and
    ``scaler``.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        is_training: When True, fit fresh encoders and a fresh scaler on this
            file. When False, reuse the objects fitted by an earlier call.

    Returns:
        Tuple ``(df, numerical_columns, categorical_columns)``: the transformed
        frame cast to float32, and the column names detected in this file.

    Raises:
        RuntimeError: If ``is_training`` is False but a required encoder or
            the scaler has not been fitted yet.
    """
    global scaler, label_encoders
    df = pd.read_csv(file_path)

    # Column roles are inferred from the dtypes pandas assigned on load.
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if not is_training:
        # Fail early with an actionable message instead of an opaque KeyError
        # or "'NoneType' object has no attribute 'transform'".
        unfitted = [col for col in categorical_columns if col not in label_encoders]
        if unfitted:
            raise RuntimeError(
                f"No fitted LabelEncoder for column(s) {unfitted}; "
                "run preprocessing with is_training=True first."
            )
        if numerical_columns and scaler is None:
            raise RuntimeError(
                "Scaler has not been fitted; run preprocessing with is_training=True first."
            )

    # Integer-encode categorical columns (label encoding, not one-hot).
    for col in categorical_columns:
        if is_training:
            label_encoders[col] = LabelEncoder()
            df[col] = label_encoders[col].fit_transform(df[col])
        else:
            df[col] = label_encoders[col].transform(df[col])

    # Scale numerical columns with the shared MinMaxScaler.
    if numerical_columns:
        if is_training:
            scaler = MinMaxScaler()
            df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
        else:
            df[numerical_columns] = scaler.transform(df[numerical_columns])

    return df.astype(np.float32), numerical_columns, categorical_columns

class Sampling(tf.keras.layers.Layer):
    """Reparameterization layer: sample z from (z_mean, z_log_var)."""

    def call(self, inputs):
        """Return ``mean + exp(0.5 * log_var) * noise`` with standard-normal noise.

        ``inputs`` is a pair ``(z_mean, z_log_var)``; the noise tensor is drawn
        with the first two dimensions of ``z_mean``.
        """
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

class VAE(tf.keras.Model):
    """Dense variational autoencoder that registers its own loss in ``call``."""

    def __init__(self, original_dim, latent_dim):
        """Build the encoder and decoder sub-models for the given sizes."""
        super().__init__()
        self.encoder = self.build_encoder(original_dim, latent_dim)
        self.decoder = self.build_decoder(original_dim, latent_dim)
        self.latent_dim = latent_dim

    def build_encoder(self, original_dim, latent_dim):
        """Return a model mapping inputs to ``[z_mean, z_log_var, z]``."""
        layers = tf.keras.layers
        inputs = layers.Input(shape=(original_dim,))
        hidden = inputs
        # Two Dense+BatchNorm stages (256 then 128 units), then dropout.
        for units in (256, 128):
            hidden = layers.Dense(units, activation="relu")(hidden)
            hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        z_mean = layers.Dense(latent_dim)(hidden)
        z_log_var = layers.Dense(latent_dim)(hidden)
        z = Sampling()([z_mean, z_log_var])
        return tf.keras.Model(inputs, [z_mean, z_log_var, z], name="encoder")

    def build_decoder(self, original_dim, latent_dim):
        """Return a model mapping a latent vector to a sigmoid reconstruction."""
        layers = tf.keras.layers
        latent_inputs = layers.Input(shape=(latent_dim,))
        hidden = latent_inputs
        # Mirror of the encoder: 128 then 256 units, then dropout.
        for units in (128, 256):
            hidden = layers.Dense(units, activation="relu")(hidden)
            hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        outputs = layers.Dense(original_dim, activation="sigmoid")(hidden)
        return tf.keras.Model(latent_inputs, outputs, name="decoder")

    def call(self, inputs):
        """Reconstruct ``inputs`` and add the reconstruction + KL loss."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # MSE is a per-row mean; scaling by the feature count turns it into
        # a per-row sum of squared errors.
        feature_count = tf.cast(tf.shape(inputs)[1], tf.float32)
        reconstruction_loss = tf.keras.losses.mse(inputs, reconstructed) * feature_count

        kl_terms = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
        kl_loss = -0.5 * tf.reduce_sum(kl_terms, axis=-1)

        self.add_loss(tf.reduce_mean(reconstruction_loss + kl_loss))
        return reconstructed

class Generator(tf.keras.Model):
    """WGAN generator: Dense/BatchNorm stack ending in a tanh output layer."""

    def __init__(self, data_dim):
        """Create the layer stack; ``data_dim`` is the width of the output."""
        super().__init__()
        layers = tf.keras.layers
        stack = []
        # Hidden stages: Dense(relu) followed by batch normalization.
        for units in (256, 512, 256):
            stack.append(layers.Dense(units, activation='relu'))
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dense(data_dim, activation='tanh'))
        self.model = tf.keras.Sequential(stack)

    def call(self, inputs):
        """Run ``inputs`` through the wrapped sequential model."""
        return self.model(inputs)

class Discriminator(tf.keras.Model):
    """WGAN critic: Dense/LayerNorm stack ending in one linear output unit."""

    def __init__(self):
        """Create the layer stack; the input width is inferred on first call."""
        super().__init__()
        layers = tf.keras.layers
        stack = []
        # Hidden stages: Dense(relu) followed by layer normalization.
        for units in (256, 512, 256):
            stack.append(layers.Dense(units, activation='relu'))
            stack.append(layers.LayerNormalization())
        stack.append(layers.Dense(1))
        self.model = tf.keras.Sequential(stack)

    def call(self, inputs):
        """Run ``inputs`` through the wrapped sequential model."""
        return self.model(inputs)

class Classifier(tf.keras.Model):
    """Dense softmax classifier with batch normalization and dropout."""

    def __init__(self, input_dim, num_classes):
        """Create the stack for ``input_dim`` features and ``num_classes`` outputs."""
        super().__init__()
        layers = tf.keras.layers
        stack = [
            layers.Dense(256, activation='relu', input_shape=(input_dim,)),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
        ]
        stack += [
            layers.Dense(128, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
        ]
        stack += [
            layers.Dense(64, activation='relu'),
            layers.BatchNormalization(),
        ]
        stack.append(layers.Dense(num_classes, activation='softmax'))
        self.model = tf.keras.Sequential(stack)

    def call(self, inputs):
        """Return the softmax class probabilities for ``inputs``."""
        return self.model(inputs)

def wasserstein_loss(y_true, y_pred):
    """Return the mean of the element-wise product ``y_true * y_pred``."""
    signed_scores = y_true * y_pred
    return tf.reduce_mean(signed_scores)

def gradient_penalty(discriminator, real_samples, fake_samples):
    """Return the WGAN-GP penalty on random real/fake interpolations.

    One uniform mixing weight is drawn per row; the penalty is the mean
    squared deviation from 1 of the critic-gradient L2 norm (taken over
    axis 1) at the interpolated points.
    """
    row_count = tf.shape(real_samples)[0]
    mix = tf.random.uniform([row_count, 1], 0.0, 1.0)
    blended = mix * real_samples + (1 - mix) * fake_samples

    with tf.GradientTape() as gp_tape:
        # `blended` is not a variable, so it must be watched explicitly.
        gp_tape.watch(blended)
        critic_scores = discriminator(blended)

    grads = gp_tape.gradient(critic_scores, blended)
    grad_norms = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1))
    return tf.reduce_mean(tf.square(grad_norms - 1.0))

def train_wgan(generator, discriminator, data, latent_dim, batch_size=64, epochs=100, n_critic=5, gp_weight=10.0):
    """Train a generator/critic pair with the WGAN-GP objective.

    Each "epoch" here is one training iteration, not a pass over ``data``:
    ``n_critic`` critic updates on random batches (sampled with replacement)
    followed by a single generator update.

    Args:
        generator: Model mapping ``(batch, latent_dim)`` noise to samples.
        discriminator: Critic model returning one score per row.
        data: Array-like of real samples, one row per sample; coerced to a
            float32 ndarray.
        latent_dim: Width of the noise vectors fed to the generator.
        batch_size: Rows per batch for both critic and generator updates.
        epochs: Number of training iterations.
        n_critic: Critic updates per generator update (must be >= 1).
        gp_weight: Multiplier of the gradient-penalty term.

    Returns:
        Dict with ``'gen_loss'`` and ``'disc_loss'`` lists holding one float
        per iteration (the critic entry is the last critic step's loss).

    Raises:
        ValueError: If ``n_critic`` is below 1 or ``data`` is empty.
    """
    if n_critic < 1:
        # Otherwise disc_loss would never be assigned before it is logged.
        raise ValueError("n_critic must be at least 1")

    # A plain float32 ndarray supports the fancy indexing below and matches
    # the default float32 dtype of the generator output.
    data = np.asarray(data, dtype=np.float32)
    if data.size == 0:
        raise ValueError("data must contain at least one sample")

    gen_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5, beta_2=0.9)
    disc_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5, beta_2=0.9)
    history = {'gen_loss': [], 'disc_loss': []}

    for epoch in range(epochs):
        for _ in range(n_critic):
            # Train Discriminator
            batch_indices = np.random.randint(0, data.shape[0], batch_size)
            real_data = data[batch_indices]

            with tf.GradientTape() as disc_tape:
                z = tf.random.normal((batch_size, latent_dim))
                fake_data = generator(z, training=True)

                real_output = discriminator(real_data, training=True)
                fake_output = discriminator(fake_data, training=True)

                # Computed inside the tape so the penalty is differentiated too.
                gp = gradient_penalty(discriminator, real_data, fake_data)
                disc_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output) + gp_weight * gp

            grads_disc = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
            disc_optimizer.apply_gradients(zip(grads_disc, discriminator.trainable_variables))

        # Train Generator
        with tf.GradientTape() as gen_tape:
            z = tf.random.normal((batch_size, latent_dim))
            fake_data = generator(z, training=True)
            fake_output = discriminator(fake_data, training=True)
            gen_loss = -tf.reduce_mean(fake_output)

        grads_gen = gen_tape.gradient(gen_loss, generator.trainable_variables)
        gen_optimizer.apply_gradients(zip(grads_gen, generator.trainable_variables))

        history['gen_loss'].append(float(gen_loss))
        history['disc_loss'].append(float(disc_loss))

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Generator Loss: {gen_loss:.4f}")
            print(f"Discriminator Loss: {disc_loss:.4f}")

    return history

def train_classifier(classifier, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    """Compile and fit ``classifier``, returning the Keras ``History``.

    Uses Adam (lr 0.001) with sparse categorical cross-entropy, and stops
    early after 5 epochs without ``val_accuracy`` improvement, restoring the
    best weights.
    """
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    classifier.compile(
        optimizer=adam,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True,
    )

    return classifier.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[stop_when_stalled],
    )

def predict_traffic(classifier, data, numerical_columns, categorical_columns):
    """Predict the traffic classification label for each row of a CSV.

    ``data`` is handed to ``preprocess_csv_with_dummies`` (so it is a CSV
    path, not an in-memory frame) and transformed with the already-fitted
    encoders/scaler. ``numerical_columns`` and ``categorical_columns`` are
    accepted but not used by this function.

    Returns the predicted labels decoded through the module-level
    ``label_encoders['classification']``.
    """
    features, _num_cols, _cat_cols = preprocess_csv_with_dummies(data, is_training=False)
    class_probabilities = classifier.predict(features)
    class_ids = np.argmax(class_probabilities, axis=1)

    # Map class indices back to the original label strings.
    return label_encoders['classification'].inverse_transform(class_ids)



def denormalize_data(synthetic_data, original_df, numerical_columns, categorical_columns, label_encoders, scaler):
    """Map synthetic rows back to the original data format and ranges.

    Args:
        synthetic_data: Array-like or DataFrame of synthetic rows; it is
            (re)framed with ``original_df``'s columns.
        original_df: Reference frame supplying the column names, the IP
            values copied into the result, and the valid
            ``classification``/``alert`` values.
        numerical_columns: Columns to pass through ``scaler.inverse_transform``.
        categorical_columns: Columns to decode with ``label_encoders``.
        label_encoders: Mapping of column name to a fitted label encoder
            (needs ``classes_`` and ``inverse_transform``).
        scaler: Fitted scaler, or None to skip numerical denormalization.

    Returns:
        A new DataFrame with ``original_df``'s columns.
    """
    df = pd.DataFrame(synthetic_data, columns=original_df.columns)

    def decode_labels(col):
        """Replace numeric codes in ``df[col]`` with labels (best effort)."""
        encoder = label_encoders[col]
        try:
            # Round instead of truncating, and clamp to the fitted code range
            # so a single stray value does not void the whole column.
            codes = np.rint(df[col].astype(float)).clip(0, len(encoder.classes_) - 1)
            df[col] = encoder.inverse_transform(codes.astype(int))
        except ValueError:
            # Non-numeric (already decoded) or NaN content: leave it untouched.
            print(f"Warning: Skipping inverse transform for column '{col}' due to ValueError.")

    # Denormalize numerical columns
    if numerical_columns and scaler is not None:
        df[numerical_columns] = scaler.inverse_transform(df[numerical_columns])

        # Ensure priority remains in the range [1,5]
        if 'priority' in df.columns:
            df['priority'] = np.round(df['priority']).astype(int).clip(1, 5)

        # Ensure port numbers are valid
        for col in ('src_port', 'dst_port'):
            if col in df.columns:
                df[col] = np.round(df[col]).astype(int).clip(0, 65535)

    # Convert categorical columns back to labels; timestamp is handled below.
    for col in categorical_columns:
        if col != 'timestamp' and col in label_encoders:
            decode_labels(col)

    # Decode the timestamp only when it is still numeric AND an encoder
    # exists; string timestamps are already in their final form.
    if ('timestamp' in df.columns and 'timestamp' in label_encoders
            and pd.api.types.is_numeric_dtype(df['timestamp'])):
        decode_labels('timestamp')

    # Validate protocol values
    if 'protocol' in df.columns:
        valid_protocols = ['TCP', 'UDP', 'ICMP']
        df['protocol'] = df['protocol'].apply(
            lambda x: x if x in valid_protocols else np.random.choice(valid_protocols)
        )

    # Copy the reference IPs. np.resize takes the leading rows when the
    # reference is longer and repeats it cyclically when it is shorter, so a
    # larger synthetic frame no longer raises a length-mismatch error.
    if 'src_ip' in df.columns and 'dst_ip' in df.columns and len(original_df) > 0:
        df['src_ip'] = np.resize(original_df['src_ip'].to_numpy(), len(df))
        df['dst_ip'] = np.resize(original_df['dst_ip'].to_numpy(), len(df))

    # Replace classification/alert values not present in the reference data.
    # The reference column is only read once its presence is confirmed.
    for col in ('classification', 'alert'):
        if col not in df.columns:
            continue
        valid_values = original_df[col].unique().tolist()
        if not valid_values:
            continue
        df[col] = df[col].apply(
            lambda x: x if x in valid_values else np.random.choice(valid_values)
        )

    return df


def generate_synthetic_data(num_samples, original_df):
    """Generate rule-based synthetic network traffic rows.

    Args:
        num_samples: Number of rows to produce.
        original_df: Frame with ``alert``, ``classification`` and
            ``protocol`` columns; their distinct values are sampled
            uniformly.

    Returns:
        DataFrame with columns ``timestamp, alert, classification, protocol,
        priority, src_ip, dst_ip, src_port, dst_port``. Timestamps advance
        one second per row from 02/14 00:00:00; all other values are random.
    """
    def random_ipv4():
        # First octet is drawn from 1-255, the remaining three from 0-255.
        octets = [np.random.randint(1, 256)]
        octets.extend(np.random.randint(0, 256) for _ in range(3))
        return ".".join(str(octet) for octet in octets)

    synthetic_data = pd.DataFrame()

    # Build the start time explicitly. It equals what parsing
    # "02/14-00:00:00.000000" with a year-less format yields (year 1900), but
    # avoids the year-less strptime that Python 3.13+ deprecates.
    start_time = datetime.datetime(1900, 2, 14)
    timestamp_format = '%m/%d-%H:%M:%S.%f'
    synthetic_data['timestamp'] = [
        (start_time + timedelta(seconds=offset)).strftime(timestamp_format)
        for offset in range(num_samples)
    ]

    # Randomly select categorical values seen in the reference data
    for col in ('alert', 'classification', 'protocol'):
        synthetic_data[col] = np.random.choice(original_df[col].unique(), num_samples)

    # Generate priority in range [1,5]
    synthetic_data['priority'] = np.random.randint(1, 6, num_samples)

    # Generate source and destination IPs
    synthetic_data['src_ip'] = [random_ipv4() for _ in range(num_samples)]
    synthetic_data['dst_ip'] = [random_ipv4() for _ in range(num_samples)]

    # Generate valid port numbers
    synthetic_data['src_port'] = np.random.randint(0, 65536, num_samples)
    synthetic_data['dst_port'] = np.random.randint(0, 65536, num_samples)

    return synthetic_data


def main():
    """Train the VAE, WGAN and classifier, then export synthetic traffic.

    The generative models are trained on every processed column. The
    classifier is trained on all columns except ``classification``, which is
    its target; leaving the target among the inputs would leak the label.

    NOTE: the exported CSV comes from the rule-based
    ``generate_synthetic_data``; the trained generator is not sampled here.
    Those rows are already in raw units, so they are not passed through
    ``denormalize_data``.
    """
    # Load and preprocess data
    file_path = '/content/simulated_traffic.csv'
    target_column = 'classification'
    raw_df = pd.read_csv(file_path)
    processed_data, numerical_columns, categorical_columns = preprocess_csv_with_dummies(file_path)

    # Generative models see all columns; the classifier must not see its label.
    X_all = processed_data.values
    X_features = processed_data.drop(columns=[target_column]).values
    y = processed_data[target_column].values

    # Split row indices once so every view of the data shares one partition.
    row_ids = np.arange(len(processed_data))
    train_ids, holdout_ids = train_test_split(row_ids, test_size=0.3, random_state=42)
    val_ids, test_ids = train_test_split(holdout_ids, test_size=0.5, random_state=42)

    # Initialize models
    original_dim = X_all.shape[1]
    feature_dim = X_features.shape[1]
    latent_dim = 32
    vae_latent_dim = 16
    num_classes = len(np.unique(y))

    # Train VAE
    vae = VAE(original_dim, vae_latent_dim)
    vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    vae.fit(X_all[train_ids], X_all[train_ids],
            validation_data=(X_all[val_ids], X_all[val_ids]),
            epochs=50, batch_size=64)

    # Train WGAN
    generator = Generator(original_dim)
    discriminator = Discriminator()
    train_wgan(generator, discriminator, X_all[train_ids], latent_dim)

    # Train Classifier
    classifier = Classifier(feature_dim, num_classes)
    train_classifier(classifier, X_features[train_ids], y[train_ids],
                     X_features[val_ids], y[val_ids])

    # Evaluate classifier
    test_loss, test_accuracy = classifier.evaluate(X_features[test_ids], y[test_ids])
    print(f"Test accuracy: {test_accuracy:.4f}")

    # Save models and preprocessing objects
    tf.keras.models.save_model(classifier, 'classifier_model.keras')
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(label_encoders, 'label_encoders.pkl')

    # Generate synthetic data from the RAW frame so sampled categories are
    # the original labels rather than their integer codes.
    num_samples = 1000
    synthetic_df = generate_synthetic_data(num_samples, raw_df)

    # Save synthetic data using the original CSV column order
    synthetic_df = synthetic_df.reindex(columns=raw_df.columns)
    synthetic_df.to_csv('synthetic_traffic.csv', index=False)

if __name__ == "__main__":
    main()

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import joblib

# Module-level preprocessing state shared by training and inference code.
# preprocess_csv_with_dummies(is_training=True) assigns/fills both objects.
scaler = None  # MinMaxScaler fitted on the numerical columns; None until fitted
label_encoders = {}  # column name -> LabelEncoder fitted on that column

def preprocess_csv_with_dummies(file_path, is_training=True):
    """Preprocess the traffic CSV using a fixed column schema.

    ``alert``, ``classification`` and ``protocol`` are label-encoded (no
    dummy/one-hot columns are created despite the name); ``src_port``,
    ``dst_port`` and ``priority`` are min-max scaled; ``timestamp``,
    ``src_ip`` and ``dst_ip`` are split off untouched. Fitted objects are
    kept in the module-level ``label_encoders`` and ``scaler``.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        is_training: When True, fit fresh encoders and a fresh scaler. When
            False, reuse the objects fitted by an earlier call.

    Returns:
        Tuple ``(df, numerical_columns, categorical_columns, timestamp_data,
        src_ip_data, dst_ip_data)`` where ``df`` is the remaining frame cast
        to float32 and the last three items are the untouched columns.

    Raises:
        RuntimeError: If ``is_training`` is False but the encoders or the
            scaler have not been fitted yet.
    """
    global scaler, label_encoders

    df = pd.read_csv(file_path)

    # Define numerical and categorical columns
    numerical_columns = ['src_port', 'dst_port', 'priority']
    categorical_columns = ['alert', 'classification', 'protocol']

    if not is_training:
        # Fail early with an actionable message instead of an opaque KeyError
        # or "'NoneType' object has no attribute 'transform'".
        unfitted = [col for col in categorical_columns if col not in label_encoders]
        if unfitted or scaler is None:
            raise RuntimeError(
                "Preprocessing objects are not fitted "
                f"(missing encoders: {unfitted}, scaler fitted: {scaler is not None}); "
                "run preprocessing with is_training=True first."
            )

    # Handle categorical columns with LabelEncoder
    for col in categorical_columns:
        if is_training:
            label_encoders[col] = LabelEncoder()
            df[col] = label_encoders[col].fit_transform(df[col])
        else:
            df[col] = label_encoders[col].transform(df[col])

    # Normalize numerical columns
    if is_training:
        scaler = MinMaxScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    else:
        df[numerical_columns] = scaler.transform(df[numerical_columns])

    # Keep timestamp and IP addresses aside; they are not model features
    timestamp_data = df['timestamp']
    src_ip_data = df['src_ip']
    dst_ip_data = df['dst_ip']

    # Drop timestamp and IP columns for training
    df = df.drop(['timestamp', 'src_ip', 'dst_ip'], axis=1)

    return (df.astype(np.float32), numerical_columns, categorical_columns,
            timestamp_data, src_ip_data, dst_ip_data)


class Sampling(tf.keras.layers.Layer):
    """Reparameterization layer: sample z from (z_mean, z_log_var)."""

    def call(self, inputs):
        """Return ``mean + exp(0.5 * log_var) * noise`` with standard-normal noise.

        ``inputs`` is a pair ``(z_mean, z_log_var)``; the noise tensor is drawn
        with the first two dimensions of ``z_mean``.
        """
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

class VAE(tf.keras.Model):
    """Dense variational autoencoder that registers its own loss in ``call``."""

    def __init__(self, original_dim, latent_dim):
        """Build the encoder and decoder sub-models for the given sizes."""
        super().__init__()
        self.encoder = self.build_encoder(original_dim, latent_dim)
        self.decoder = self.build_decoder(original_dim, latent_dim)
        self.latent_dim = latent_dim

    def build_encoder(self, original_dim, latent_dim):
        """Return a model mapping inputs to ``[z_mean, z_log_var, z]``."""
        layers = tf.keras.layers
        inputs = layers.Input(shape=(original_dim,))
        hidden = inputs
        # Two Dense+BatchNorm stages (256 then 128 units), then dropout.
        for units in (256, 128):
            hidden = layers.Dense(units, activation="relu")(hidden)
            hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        z_mean = layers.Dense(latent_dim)(hidden)
        z_log_var = layers.Dense(latent_dim)(hidden)
        z = Sampling()([z_mean, z_log_var])
        return tf.keras.Model(inputs, [z_mean, z_log_var, z], name="encoder")

    def build_decoder(self, original_dim, latent_dim):
        """Return a model mapping a latent vector to a sigmoid reconstruction."""
        layers = tf.keras.layers
        latent_inputs = layers.Input(shape=(latent_dim,))
        hidden = latent_inputs
        # Mirror of the encoder: 128 then 256 units, then dropout.
        for units in (128, 256):
            hidden = layers.Dense(units, activation="relu")(hidden)
            hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        outputs = layers.Dense(original_dim, activation="sigmoid")(hidden)
        return tf.keras.Model(latent_inputs, outputs, name="decoder")

    def call(self, inputs):
        """Reconstruct ``inputs`` and add the reconstruction + KL loss."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # MSE is a per-row mean; scaling by the feature count turns it into
        # a per-row sum of squared errors.
        feature_count = tf.cast(tf.shape(inputs)[1], tf.float32)
        reconstruction_loss = tf.keras.losses.mse(inputs, reconstructed) * feature_count

        kl_terms = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
        kl_loss = -0.5 * tf.reduce_sum(kl_terms, axis=-1)

        self.add_loss(tf.reduce_mean(reconstruction_loss + kl_loss))
        return reconstructed

class Generator(tf.keras.Model):
    """WGAN generator: Dense/BatchNorm stack ending in a tanh output layer."""

    def __init__(self, data_dim):
        """Create the layer stack; ``data_dim`` is the width of the output."""
        super().__init__()
        layers = tf.keras.layers
        stack = []
        # Hidden stages: Dense(relu) followed by batch normalization.
        for units in (256, 512, 256):
            stack.append(layers.Dense(units, activation='relu'))
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dense(data_dim, activation='tanh'))
        self.model = tf.keras.Sequential(stack)

    def call(self, inputs):
        """Run ``inputs`` through the wrapped sequential model."""
        return self.model(inputs)

class Discriminator(tf.keras.Model):
    """WGAN critic: Dense/LayerNorm stack ending in one linear output unit."""

    def __init__(self):
        """Create the layer stack; the input width is inferred on first call."""
        super().__init__()
        layers = tf.keras.layers
        stack = []
        # Hidden stages: Dense(relu) followed by layer normalization.
        for units in (256, 512, 256):
            stack.append(layers.Dense(units, activation='relu'))
            stack.append(layers.LayerNormalization())
        stack.append(layers.Dense(1))
        self.model = tf.keras.Sequential(stack)

    def call(self, inputs):
        """Run ``inputs`` through the wrapped sequential model."""
        return self.model(inputs)

class Classifier(tf.keras.Model):
    """Dense softmax classifier with batch normalization and dropout."""

    def __init__(self, input_dim, num_classes):
        """Create the stack for ``input_dim`` features and ``num_classes`` outputs."""
        super().__init__()
        layers = tf.keras.layers
        stack = [
            layers.Dense(256, activation='relu', input_shape=(input_dim,)),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
        ]
        stack += [
            layers.Dense(128, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
        ]
        stack += [
            layers.Dense(64, activation='relu'),
            layers.BatchNormalization(),
        ]
        stack.append(layers.Dense(num_classes, activation='softmax'))
        self.model = tf.keras.Sequential(stack)

    def call(self, inputs):
        """Return the softmax class probabilities for ``inputs``."""
        return self.model(inputs)

def wasserstein_loss(y_true, y_pred):
    """Return the mean of the element-wise product ``y_true * y_pred``."""
    signed_scores = y_true * y_pred
    return tf.reduce_mean(signed_scores)

def gradient_penalty(discriminator, real_samples, fake_samples):
    """Return the WGAN-GP penalty on random real/fake interpolations.

    One uniform mixing weight is drawn per row; the penalty is the mean
    squared deviation from 1 of the critic-gradient L2 norm (taken over
    axis 1) at the interpolated points.
    """
    row_count = tf.shape(real_samples)[0]
    mix = tf.random.uniform([row_count, 1], 0.0, 1.0)
    blended = mix * real_samples + (1 - mix) * fake_samples

    with tf.GradientTape() as gp_tape:
        # `blended` is not a variable, so it must be watched explicitly.
        gp_tape.watch(blended)
        critic_scores = discriminator(blended)

    grads = gp_tape.gradient(critic_scores, blended)
    grad_norms = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1))
    return tf.reduce_mean(tf.square(grad_norms - 1.0))

def train_wgan(generator, discriminator, data, latent_dim, batch_size=64, epochs=100, n_critic=5, gp_weight=10.0):
    """Train a generator/critic pair with the WGAN-GP objective.

    Each "epoch" here is one training iteration, not a pass over ``data``:
    ``n_critic`` critic updates on random batches (sampled with replacement)
    followed by a single generator update.

    Args:
        generator: Model mapping ``(batch, latent_dim)`` noise to samples.
        discriminator: Critic model returning one score per row.
        data: Array-like of real samples, one row per sample; coerced to a
            float32 ndarray.
        latent_dim: Width of the noise vectors fed to the generator.
        batch_size: Rows per batch for both critic and generator updates.
        epochs: Number of training iterations.
        n_critic: Critic updates per generator update (must be >= 1).
        gp_weight: Multiplier of the gradient-penalty term.

    Returns:
        Dict with ``'gen_loss'`` and ``'disc_loss'`` lists holding one float
        per iteration (the critic entry is the last critic step's loss).

    Raises:
        ValueError: If ``n_critic`` is below 1 or ``data`` is empty.
    """
    if n_critic < 1:
        # Otherwise disc_loss would never be assigned before it is logged.
        raise ValueError("n_critic must be at least 1")

    # A plain float32 ndarray supports the fancy indexing below and matches
    # the default float32 dtype of the generator output.
    data = np.asarray(data, dtype=np.float32)
    if data.size == 0:
        raise ValueError("data must contain at least one sample")

    gen_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5, beta_2=0.9)
    disc_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5, beta_2=0.9)
    history = {'gen_loss': [], 'disc_loss': []}

    for epoch in range(epochs):
        for _ in range(n_critic):
            # Train Discriminator
            batch_indices = np.random.randint(0, data.shape[0], batch_size)
            real_data = data[batch_indices]

            with tf.GradientTape() as disc_tape:
                z = tf.random.normal((batch_size, latent_dim))
                fake_data = generator(z, training=True)

                real_output = discriminator(real_data, training=True)
                fake_output = discriminator(fake_data, training=True)

                # Computed inside the tape so the penalty is differentiated too.
                gp = gradient_penalty(discriminator, real_data, fake_data)
                disc_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output) + gp_weight * gp

            grads_disc = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
            disc_optimizer.apply_gradients(zip(grads_disc, discriminator.trainable_variables))

        # Train Generator
        with tf.GradientTape() as gen_tape:
            z = tf.random.normal((batch_size, latent_dim))
            fake_data = generator(z, training=True)
            fake_output = discriminator(fake_data, training=True)
            gen_loss = -tf.reduce_mean(fake_output)

        grads_gen = gen_tape.gradient(gen_loss, generator.trainable_variables)
        gen_optimizer.apply_gradients(zip(grads_gen, generator.trainable_variables))

        history['gen_loss'].append(float(gen_loss))
        history['disc_loss'].append(float(disc_loss))

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"Generator Loss: {gen_loss:.4f}")
            print(f"Discriminator Loss: {disc_loss:.4f}")

    return history

def train_classifier(classifier, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    """Compile and fit ``classifier``, returning the Keras ``History``.

    Uses Adam (lr 0.001) with sparse categorical cross-entropy, and stops
    early after 5 epochs without ``val_accuracy`` improvement, restoring the
    best weights.
    """
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    classifier.compile(
        optimizer=adam,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True,
    )

    return classifier.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[stop_when_stalled],
    )

def predict_traffic(classifier, data, numerical_columns, categorical_columns):
    """Predict the traffic classification label for each row of a CSV.

    ``data`` is handed to ``preprocess_csv_with_dummies`` (so it is a CSV
    path, not an in-memory frame) and transformed with the already-fitted
    encoders/scaler. ``numerical_columns`` and ``categorical_columns`` are
    accepted but not used by this function.

    NOTE(review): the processed frame still contains the encoded
    ``classification`` column, so the classifier must have been trained on
    the same column layout - confirm against the training code.

    Returns the predicted labels decoded through the module-level
    ``label_encoders['classification']``.
    """
    # The preprocessing step returns more than three values here; take the
    # feature frame and ignore the rest instead of unpacking a fixed count.
    processed_data, *_ = preprocess_csv_with_dummies(data, is_training=False)
    predictions = classifier.predict(processed_data)
    predicted_classes = np.argmax(predictions, axis=1)

    # Convert numeric predictions back to original labels
    original_labels = label_encoders['classification'].inverse_transform(predicted_classes)

    return original_labels

def denormalize_data(synthetic_data, original_df, numerical_columns, categorical_columns):
    """Convert generator output into a frame shaped like the original CSV.

    Uses the module-level ``scaler`` and ``label_encoders``.

    Args:
        synthetic_data: 2-D array-like with six columns ordered as
            ``alert, classification, priority, protocol, src_port, dst_port``.
        original_df: Accepted for interface compatibility; not read here.
        numerical_columns: Columns passed through ``scaler.inverse_transform``
            (must be in the order the scaler was fitted with).
        categorical_columns: Columns decoded with ``label_encoders``.

    Returns:
        DataFrame with columns ``timestamp, alert, classification, priority,
        protocol, src_ip, src_port, dst_ip, dst_port``. Timestamps and IP
        addresses are generated here, not taken from ``synthetic_data``.
    """
    def random_ipv4():
        # Every octet is drawn from 1-255.
        return ".".join(str(np.random.randint(1, 256)) for _ in range(4))

    # Convert to DataFrame
    df = pd.DataFrame(synthetic_data, columns=['alert', 'classification', 'priority',
                                               'protocol', 'src_port', 'dst_port'])

    # Denormalize numerical columns
    if scaler is not None:
        df[numerical_columns] = scaler.inverse_transform(df[numerical_columns])

        # Round ports and priority to integers and ensure valid ranges
        df['src_port'] = np.round(df['src_port']).astype(int).clip(1, 65535)
        df['dst_port'] = np.round(df['dst_port']).astype(int).clip(1, 65535)
        df['priority'] = np.round(df['priority']).astype(int).clip(1, 5)

    # Convert categorical columns back to original labels. Each value is a
    # single continuous code, so round it to the nearest class index and
    # clamp to the fitted range. (An argmax over a one-element axis would
    # always give class 0.)
    for col in categorical_columns:
        encoder = label_encoders[col]
        last_code = len(encoder.classes_) - 1
        codes = np.rint(df[col].to_numpy(dtype=float)).clip(0, last_code).astype(int)
        df[col] = encoder.inverse_transform(codes)

    # Generate timestamps in the correct format
    base_timestamp = pd.Timestamp('2025-02-14')
    df['timestamp'] = [
        (base_timestamp + pd.Timedelta(seconds=i)).strftime('%m/%d-%H:%M:%S.%f')
        for i in range(len(df))
    ]

    # Generate valid IP addresses (list-based so an empty frame also works)
    df['src_ip'] = [random_ipv4() for _ in range(len(df))]
    df['dst_ip'] = [random_ipv4() for _ in range(len(df))]

    # Reorder columns to match original CSV
    df = df[['timestamp', 'alert', 'classification', 'priority', 'protocol',
             'src_ip', 'src_port', 'dst_ip', 'dst_port']]

    return df

# The VAE, Generator, and Discriminator classes used below are defined earlier in this cell.

def generate_synthetic_data(generator, num_samples, latent_dim, original_df):
    """Sample the generator and return the denormalized synthetic rows.

    Draws ``num_samples`` standard-normal latent vectors of width
    ``latent_dim``, runs the generator in inference mode, and hands the raw
    output to ``denormalize_data`` with the fixed column schema.
    """
    latent_noise = tf.random.normal((num_samples, latent_dim))
    raw_samples = generator(latent_noise, training=False).numpy()

    # Fixed schema shared with the preprocessing step.
    scaled_columns = ['src_port', 'dst_port', 'priority']
    encoded_columns = ['alert', 'classification', 'protocol']

    return denormalize_data(raw_samples, original_df, scaled_columns, encoded_columns)

def main():
    """Train the VAE and WGAN on the traffic CSV and export synthetic rows.

    Writes ``trial.csv`` and prints a side-by-side sample plus a structure
    check of the synthetic frame against the original one.
    """
    # Load and preprocess data
    file_path = '/content/simulated_traffic.csv'
    original_df = pd.read_csv(file_path)

    # Process data (the timestamp/IP series are returned but not needed here)
    (processed_data, numerical_columns, categorical_columns,
     timestamp_data, src_ip_data, dst_ip_data) = preprocess_csv_with_dummies(file_path)

    # Split data
    X = processed_data.values
    X_train, X_temp = train_test_split(X, test_size=0.3, random_state=42)
    X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

    # Initialize and train models
    original_dim = X.shape[1]
    latent_dim = 32

    # Train VAE
    vae = VAE(original_dim, latent_dim)
    vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    vae.fit(X_train, X_train, validation_data=(X_val, X_val), epochs=50, batch_size=64)

    # Train WGAN
    generator = Generator(original_dim)
    discriminator = Discriminator()
    train_wgan(generator, discriminator, X_train, latent_dim)

    # Generate synthetic data
    num_samples = 1000
    synthetic_df = generate_synthetic_data(generator, num_samples, latent_dim, original_df)

    # Save synthetic data
    synthetic_df.to_csv('trial.csv', index=False)

    # Print sample comparison
    print("\nOriginal Data Sample:")
    print(original_df.head())
    print("\nSynthetic Data Sample:")
    print(synthetic_df.head())

    # Verify data structure. Comparing plain lists reports False when the
    # column counts differ; an element-wise Index comparison would raise.
    columns_match = list(synthetic_df.columns) == list(original_df.columns)
    print("\nColumn names match:", columns_match)
    print("\nData types:")
    for col in synthetic_df.columns:
        print(f"{col}: {synthetic_df[col].dtype}")

if __name__ == "__main__":
    main()

Epoch 1/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - loss: 16.1511 - val_loss: 14.1571
Epoch 2/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 14.4929 - val_loss: 14.1479
Epoch 3/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14.1910 - val_loss: 14.1437
Epoch 4/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14.1533 - val_loss: 14.1413
Epoch 5/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14.2639 - val_loss: 14.1439
Epoch 6/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14.3032 - val_loss: 14.1460
Epoch 7/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14.2471 - val_loss: 14.1442
Epoch 8/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14.3863 - val_loss: 14.1434
Epoch 9/50
[1m55/55[0m [32m━━━━━━━━━