In [1]:
# Step 1: Install needed packages
!pip install pandas numpy tensorflow faker

# Step 2: Import
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers

# Step 3: Load your CSV file
df = pd.read_csv('/content/Customer.csv')  # Replace with your uploaded file

# Step 4: Convert DOB to age
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
today = pd.to_datetime('today')
df['age'] = (today.year - df['date_of_birth'].dt.year).fillna(30).astype(int)

# Step 5: Normalize the age values between -1 and 1
scaler = MinMaxScaler(feature_range=(-1, 1))
age_scaled = scaler.fit_transform(df[['age']])

# Step 6: Define the Generator
def build_generator():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(10,)),
        layers.Dense(16, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='tanh')  # Output a single value
    ])
    return model

# Step 7: Define the Discriminator
def build_discriminator():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(1,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # 1 for real, 0 for fake
    ])
    return model

# Step 8: Instantiate the models
generator = build_generator()
discriminator = build_discriminator()

# Step 9: Compile the Discriminator
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 10: Build and Compile the GAN
discriminator.trainable = False
gan_input = tf.keras.Input(shape=(10,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')


Collecting faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.3.0


  df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')


In [2]:
# Rebuild Generator
def build_generator():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(10,)),
        layers.Dense(16, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='tanh')
    ])
    return model

# Rebuild Discriminator
def build_discriminator():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(1,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

# Re-create the models
generator = build_generator()
discriminator = build_discriminator()

# Compile the discriminator first
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Freeze discriminator
discriminator.trainable = False

# Build GAN model
gan_input = tf.keras.Input(shape=(10,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)

# Compile GAN
gan.compile(optimizer='adam', loss='binary_crossentropy')

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume generator is your trained Keras model from before
num_samples = 1000
noise_dim = 10  # match your generator input

noise = np.random.normal(0, 1, (num_samples, noise_dim))
generated_scaled_ages = generator.predict(noise)

synthetic_ages = scaler.inverse_transform(generated_scaled_ages)
synthetic_ages = synthetic_ages.flatten().astype(int)

print(synthetic_ages[:10])

# Inverse transform to original age scale
# Assuming you used MinMaxScaler fitted on real ages earlier as 'scaler'
synthetic_ages = scaler.inverse_transform(generated_scaled_ages)

# Convert to integer ages
synthetic_ages = synthetic_ages.flatten().astype(int)

print(synthetic_ages[:10])  # Check first 10 synthetic ages

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[30 30 30 30 30 30 30 30 30 30]
[30 30 30 30 30 30 30 30 30 30]


In [5]:
# Step 11: Train the GAN
epochs = 5000
batch_size = 32

for epoch in range(epochs):
    # Real samples
    idx = np.random.randint(0, age_scaled.shape[0], batch_size)
    real_ages = age_scaled[idx]

    # Fake samples
    noise = np.random.normal(0, 1, (batch_size, 10))
    fake_ages = generator.predict(noise, verbose=0)

    # Train discriminator
    d_loss_real = discriminator.train_on_batch(real_ages, np.ones((batch_size, 1)))
    d_loss_fake = discriminator.train_on_batch(fake_ages, np.zeros((batch_size, 1)))

    # Train generator
    noise = np.random.normal(0, 1, (batch_size, 10))
    g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

    # Print progress
    if epoch % 100 == 0:
        print(f"{epoch} [D loss: {(d_loss_real[0] + d_loss_fake[0]):.4f}] [G loss: {g_loss:.4f}]")

0 [D loss: 1.3089] [G loss: 0.6188]
100 [D loss: 1.3398] [G loss: 0.5936]
200 [D loss: 1.3480] [G loss: 0.5870]
300 [D loss: 1.3518] [G loss: 0.5841]
400 [D loss: 1.3539] [G loss: 0.5824]
500 [D loss: 1.3552] [G loss: 0.5813]
600 [D loss: 1.3562] [G loss: 0.5805]
700 [D loss: 1.3569] [G loss: 0.5800]
800 [D loss: 1.3574] [G loss: 0.5795]
900 [D loss: 1.3579] [G loss: 0.5792]
1000 [D loss: 1.3582] [G loss: 0.5789]
1100 [D loss: 1.3585] [G loss: 0.5787]
1200 [D loss: 1.3587] [G loss: 0.5785]
1300 [D loss: 1.3589] [G loss: 0.5783]
1400 [D loss: 1.3591] [G loss: 0.5782]
1500 [D loss: 1.3593] [G loss: 0.5781]
1600 [D loss: 1.3594] [G loss: 0.5780]
1700 [D loss: 1.3595] [G loss: 0.5779]
1800 [D loss: 1.3596] [G loss: 0.5778]
1900 [D loss: 1.3597] [G loss: 0.5777]
2000 [D loss: 1.3598] [G loss: 0.5776]
2100 [D loss: 1.3599] [G loss: 0.5776]
2200 [D loss: 1.3600] [G loss: 0.5775]
2300 [D loss: 1.3600] [G loss: 0.5775]
2400 [D loss: 1.3601] [G loss: 0.5774]
2500 [D loss: 1.3602] [G loss: 0.5774

In [3]:
pip install ctgan



In [7]:
import pandas as pd
from ctgan import CTGAN
# Load your real dataset
real_data = pd.read_csv("Customer.csv")

# Drop unique ID for training
real_data = real_data.drop(columns=["customer_id"])

# Define categorical columns
categorical_columns = ["first_name", "last_name", "date_of_birth", "address", "phone_number"]

# Initialize CTGAN model
ctgan = CTGAN(epochs=300)  # train for 300 epochs

# Train the model
ctgan.fit(real_data, discrete_columns=categorical_columns)

# Generate synthetic data samples
synthetic_data = ctgan.sample(500)

# Add synthetic UUIDs for customer_id
import uuid
synthetic_data["customer_id"] = [str(uuid.uuid4()) for _ in range(len(synthetic_data))]

# Reorder columns
cols = ["customer_id"] + [col for col in synthetic_data.columns if col != "customer_id"]
synthetic_data = synthetic_data[cols]

print(synthetic_data.head())

synthetic_data.to_csv("synthetic_CTGan_customers.csv", index=False)


                            customer_id first_name last_name date_of_birth  \
0  12736124-0ff1-4c46-8ffc-b8b797afef20     Smythe   Casarez    10/17/1971   
1  42df858e-a5b1-40d7-9c76-67d63ff49661    Killian     Shine     6/17/1971   
2  dcc5122b-22c5-4fea-9972-4acc7d459de8    Salazar  Trinidad    11/10/1971   
3  9b38282f-dfda-4c8d-bbab-882de055e8e6    Varnado    Urbano      6/1/1971   
4  4a7dd2a9-3a61-4107-8f91-fcbc4840e873  Cardinale      Maya    10/17/1971   

      address    phone_number  
0    Sec-1998  (557) 557-7957  
1    Sec-1158  (713) 413-4513  
2      B-1681  (750) 450-5150  
3      D-1244  (620) 620-9220  
4  Block-1418  (656) 256-7456  
