In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Dense,
    LeakyReLU,
    BatchNormalization,
    Input,
    Embedding,
    Flatten,
    Concatenate,
)
from sklearn.model_selection import train_test_split

# Step 1: Read the preprocessed flow records from disk
csv_path = "/content/preprocessed_nf_uq_nids_haaa.csv"
data = pd.read_csv(csv_path)

# Step 2: Separate the target column from the remaining feature columns
y = data["Label"].values  # Labels: the "Label" column as an array
X = data.drop("Label", axis=1).values  # Features: every column except "Label"

# Step 3: Define Generator
def build_generator(input_dim, condition_dim, output_dim, num_classes=2, embedding_dim=10):
    """Build the conditional generator network.

    The generator takes a noise vector and an integer class condition,
    embeds the condition, concatenates both, and maps the result through
    two Dense -> LeakyReLU -> BatchNormalization stages to a feature vector.

    Args:
        input_dim: Length of the noise vector fed to the generator.
        condition_dim: Number of integer condition values per sample
            (width of the condition input). Previously ignored and
            hard-coded to 1; passing 1 reproduces the old behaviour.
        output_dim: Number of features the generator produces.
        num_classes: Size of the condition vocabulary for the embedding
            (default 2, i.e. labels 0 and 1).
        embedding_dim: Width of each condition embedding vector (default 10).

    Returns:
        A Keras ``Model`` named "Generator" with inputs
        ``[noise, condition]`` and a tanh output in [-1, 1].
    """
    noise_input = Input(shape=(input_dim,))
    condition_input = Input(shape=(condition_dim,))

    # Map each integer condition to a dense vector, then flatten so it can
    # be concatenated with the 2-D noise tensor.
    condition_embedding = Embedding(input_dim=num_classes, output_dim=embedding_dim)(condition_input)
    condition_embedding = Flatten()(condition_embedding)

    # Combine noise and condition
    x = Concatenate()([noise_input, condition_embedding])

    # Hidden stack: same layer order as before (128 units, then 256 units).
    for units in (128, 256):
        x = Dense(units)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = BatchNormalization()(x)

    # tanh bounds every generated feature to [-1, 1].
    output = Dense(output_dim, activation="tanh")(x)

    return Model([noise_input, condition_input], output, name="Generator")

# Step 4: Define Discriminator
def build_discriminator(input_dim, condition_dim, num_classes=2, embedding_dim=10):
    """Build the conditional discriminator network.

    The discriminator takes a feature vector and an integer class
    condition, embeds the condition, concatenates both, and maps the result
    through two Dense -> LeakyReLU stages to a single sigmoid probability.

    Args:
        input_dim: Number of features in each (real or generated) sample.
        condition_dim: Number of integer condition values per sample
            (width of the condition input). Previously ignored and
            hard-coded to 1; passing 1 reproduces the old behaviour.
        num_classes: Size of the condition vocabulary for the embedding
            (default 2, i.e. labels 0 and 1).
        embedding_dim: Width of each condition embedding vector (default 10).

    Returns:
        A Keras ``Model`` named "Discriminator" with inputs
        ``[features, condition]`` and one sigmoid output unit.
    """
    feature_input = Input(shape=(input_dim,))
    condition_input = Input(shape=(condition_dim,))

    # Map each integer condition to a dense vector, then flatten so it can
    # be concatenated with the 2-D feature tensor.
    condition_embedding = Embedding(input_dim=num_classes, output_dim=embedding_dim)(condition_input)
    condition_embedding = Flatten()(condition_embedding)

    # Combine features and condition
    x = Concatenate()([feature_input, condition_embedding])

    # Hidden stack: same layer order as before (256 units, then 128 units).
    for units in (256, 128):
        x = Dense(units)(x)
        x = LeakyReLU(negative_slope=0.2)(x)

    # Single sigmoid unit: the model's real-vs-fake score for the sample.
    output = Dense(1, activation="sigmoid")(x)

    return Model([feature_input, condition_input], output, name="Discriminator")

# Step 5: Compile GAN
input_dim = X.shape[1]  # Number of features
latent_dim = 100  # Noise dimension

# Build models
generator = build_generator(latent_dim, 1, input_dim)
discriminator = build_discriminator(input_dim, 1)
discriminator.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Freeze discriminator when training generator: the flag is flipped after the
# discriminator's own compile() and before the stacked model is compiled.
discriminator.trainable = False

# GAN model: noise + condition -> generator -> discriminator -> validity
noise_input = Input(shape=(latent_dim,))
condition_input = Input(shape=(1,))
generated_features = generator([noise_input, condition_input])
validity = discriminator([generated_features, condition_input])

gan = Model([noise_input, condition_input], validity)
gan.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002), loss="binary_crossentropy")

# Step 6: Train GAN
batch_size = 128
epochs = 100  # Each "epoch" here is one random batch; increase once stable

# Target vectors for the discriminator: 1 = real, 0 = generated
real = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))

# NOTE(review): the generator ends in tanh, so its outputs lie in [-1, 1];
# presumably the preprocessed features are scaled to that range - confirm.
for epoch in range(epochs):
    # Train Discriminator on one random real batch and one generated batch
    idx = np.random.randint(0, X.shape[0], batch_size)
    real_features, real_labels = X[idx], y[idx]

    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    fake_labels = np.random.randint(0, 2, batch_size)
    # verbose=0 keeps predict() from printing a progress bar every iteration.
    generated_features = generator.predict([noise, fake_labels], verbose=0)

    d_loss_real = discriminator.train_on_batch([real_features, real_labels], real)
    d_loss_fake = discriminator.train_on_batch([generated_features, fake_labels], fake)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train Generator: ask the frozen discriminator to call the fakes "real"
    sampled_labels = np.random.randint(0, 2, batch_size)
    g_loss = gan.train_on_batch([noise, sampled_labels], real)

    # train_on_batch may return a scalar or a list whose first entry is the
    # loss; normalise to a single float so the log line stays readable.
    g_loss_value = float(np.ravel(g_loss)[0])

    # Print progress
    if epoch % 10 == 0:
        print(f"Epoch {epoch}/{epochs}, D Loss: {d_loss[0]}, D Accuracy: {d_loss[1]}, G Loss: {g_loss_value}")

# Step 7: Generate Synthetic Data
num_samples = 10000
noise = np.random.normal(0, 1, (num_samples, latent_dim))
sampled_labels = np.random.randint(0, 2, num_samples)
synthetic_data = generator.predict([noise, sampled_labels])

# Save Synthetic Data
# Use the feature column names in the same order X was built from, instead of
# assuming "Label" is the last column of the CSV.
feature_columns = data.columns.drop("Label")
synthetic_df = pd.DataFrame(synthetic_data, columns=feature_columns)
synthetic_df["Label"] = sampled_labels
output_path = "synthetic_data1.csv"
synthetic_df.to_csv(output_path, index=False)
# Report the file that was actually written.
print(f"Synthetic data saved as {output_path}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  




Epoch 0/100, D Loss: 0.6040009260177612, D Accuracy: 0.8046875, G Loss: [array(0.6436981, dtype=float32), array(0.6436981, dtype=float32), array(0.609375, dtype=float32)]
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Epoch 10/100, D Loss: 0.6455023288726807, D Accuracy: 0.5916277766227722, G Loss: [array(0.64747167, dtype=fl