In [None]:
# Step 1: Install needed packages
!pip install pandas numpy tensorflow faker

# Step 2: Import
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers

# Step 3: Load your CSV file
df = pd.read_csv('/content/Customer.csv')  # Replace with your uploaded file

# Step 4: Convert DOB to age
# Values that cannot be parsed as dates become NaT (errors='coerce') and are
# given the fallback age below.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
today = pd.to_datetime('today')
dob = df['date_of_birth']

# A plain year difference over-counts by one for everybody whose birthday has
# not happened yet this year, so subtract 1 for those rows. Comparisons against
# NaT/NaN evaluate to False, so missing dates stay NaN until fillna().
birthday_pending = (dob.dt.month > today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day > today.day)
)

DEFAULT_AGE = 30  # fallback age for rows with a missing/unparseable date of birth
df['age'] = (
    (today.year - dob.dt.year - birthday_pending.astype(int))
    .fillna(DEFAULT_AGE)
    .astype(int)
)

# Step 5: Normalize the age values between -1 and 1
# The fitted scaler is kept so generated values can be mapped back with
# scaler.inverse_transform().
scaler = MinMaxScaler(feature_range=(-1, 1))
age_scaled = scaler.fit_transform(df[['age']])

# Step 6: Define the Generator
def build_generator():
    """Create the (uncompiled) generator network.

    Returns:
        A ``tf.keras.Sequential`` model that accepts a 10-value input vector
        and applies Dense(16, relu) -> Dense(32, relu) -> Dense(1, tanh),
        i.e. it emits one value per sample.
    """
    generator_layers = [
        tf.keras.Input(shape=(10,)),
        layers.Dense(16, activation='relu'),
        layers.Dense(32, activation='relu'),
        # Single output unit; tanh bounds it to the (-1, 1) interval
        layers.Dense(1, activation='tanh'),
    ]
    return tf.keras.Sequential(generator_layers)

# Step 7: Define the Discriminator
def build_discriminator():
    """Create the (uncompiled) discriminator network.

    Returns:
        A ``tf.keras.Sequential`` model that accepts a single input value and
        applies Dense(32, relu) -> Dense(16, relu) -> Dense(1, sigmoid). The
        sigmoid output is used as the real (1) / fake (0) score.
    """
    discriminator_layers = [
        tf.keras.Input(shape=(1,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        # One sigmoid unit: 1 for real, 0 for fake
        layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(discriminator_layers)

# Step 8: Instantiate the models
# Each call builds a brand-new network with freshly initialised weights.
generator = build_generator()
discriminator = build_discriminator()

# Step 9: Compile the Discriminator
# Compiled on its own so it can be trained directly (train_on_batch) on batches
# of real and generated values; accuracy is tracked as an extra metric.
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 10: Build and Compile the GAN
# The discriminator is frozen AFTER its own compile() and BEFORE the combined
# model is compiled, so that training `gan` is meant to update only the generator.
# NOTE(review): how a `trainable` change made after compile() affects the already
# compiled discriminator differs between Keras versions — confirm that
# discriminator.train_on_batch still updates its weights on the installed version.
discriminator.trainable = False
# Combined model: 10-value noise vector -> generator -> discriminator score.
gan_input = tf.keras.Input(shape=(10,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')


Collecting faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.3.0


  df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')


In [None]:
# Rebuild Generator
def build_generator():
    """Create the (uncompiled) generator network.

    Returns:
        A ``tf.keras.Sequential`` model mapping a 10-value input vector to one
        output value through Dense(16, relu) -> Dense(32, relu) ->
        Dense(1, tanh).
    """
    generator_layers = [
        tf.keras.Input(shape=(10,)),
        layers.Dense(16, activation='relu'),
        layers.Dense(32, activation='relu'),
        # Single tanh unit, so the output lies in (-1, 1)
        layers.Dense(1, activation='tanh'),
    ]
    return tf.keras.Sequential(generator_layers)

# Rebuild Discriminator
def build_discriminator():
    """Create the (uncompiled) discriminator network.

    Returns:
        A ``tf.keras.Sequential`` model mapping a single input value to one
        sigmoid score through Dense(32, relu) -> Dense(16, relu) ->
        Dense(1, sigmoid).
    """
    discriminator_layers = [
        tf.keras.Input(shape=(1,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        # Single sigmoid unit producing a score in (0, 1)
        layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(discriminator_layers)

# Re-create the models
# Calling the builders again yields new networks with fresh weights; anything
# learned by previously created `generator` / `discriminator` objects is not
# carried over.
generator = build_generator()
discriminator = build_discriminator()

# Compile the discriminator first
# It is compiled standalone so it can be trained directly on real/fake batches.
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Freeze discriminator
# Done after its own compile() and before the combined model is compiled, so
# that training `gan` is meant to update only the generator.
# NOTE(review): the effect of changing `trainable` after compile() on the already
# compiled discriminator depends on the Keras version — confirm.
discriminator.trainable = False

# Build GAN model
# 10-value noise vector -> generator -> discriminator score.
gan_input = tf.keras.Input(shape=(10,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)

# Compile GAN
gan.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Sample synthetic ages from the generator.
# Relies on `generator` and on `scaler` (the MinMaxScaler fitted on the real ages)
# created in earlier cells.
# NOTE(review): this cell is placed before the training cell; if it is run before
# training, the generator still has its initial weights and the sampled ages are
# not meaningful — confirm the intended execution order.
num_samples = 1000
noise_dim = 10  # match your generator input

noise = np.random.normal(0, 1, (num_samples, noise_dim))
generated_scaled_ages = generator.predict(noise)

# Inverse transform from the scaler's (-1, 1) range back to the original age scale
synthetic_ages = scaler.inverse_transform(generated_scaled_ages)

# Convert to integer ages (astype(int) truncates the fractional part)
synthetic_ages = synthetic_ages.flatten().astype(int)

print(synthetic_ages[:10])  # Check first 10 synthetic ages

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[30 30 30 30 30 30 30 30 30 30]
[30 30 30 30 30 30 30 30 30 30]


In [None]:
# Step 11: Train the GAN
epochs = 5000
batch_size = 32
noise_dim = 10   # must match the generator's input size
log_every = 100  # print the losses every `log_every` iterations

# Target labels are identical for every step, so build them once.
real_labels = np.ones((batch_size, 1))
fake_labels = np.zeros((batch_size, 1))

for epoch in range(epochs):
    # Real samples: a random batch (drawn with replacement) of scaled ages
    idx = np.random.randint(0, age_scaled.shape[0], batch_size)
    real_ages = age_scaled[idx]

    # Fake samples: call the model directly instead of predict(). predict() is
    # meant for large inputs and adds noticeable per-call overhead when it is
    # invoked thousands of times on a single small batch.
    noise = np.random.normal(0, 1, (batch_size, noise_dim))
    fake_ages = np.asarray(generator(noise, training=False))

    # Train discriminator: real batch labelled 1, generated batch labelled 0
    d_loss_real = discriminator.train_on_batch(real_ages, real_labels)
    d_loss_fake = discriminator.train_on_batch(fake_ages, fake_labels)

    # Train generator through the combined model: fresh noise is labelled as
    # "real" so the generator is pushed towards fooling the discriminator
    noise = np.random.normal(0, 1, (batch_size, noise_dim))
    g_loss = gan.train_on_batch(noise, real_labels)

    # Print progress (the reported D loss is the SUM of the real and fake losses)
    if epoch % log_every == 0:
        print(f"{epoch} [D loss: {(d_loss_real[0] + d_loss_fake[0]):.4f}] [G loss: {g_loss:.4f}]")

0 [D loss: 1.3089] [G loss: 0.6188]
100 [D loss: 1.3398] [G loss: 0.5936]
200 [D loss: 1.3480] [G loss: 0.5870]
300 [D loss: 1.3518] [G loss: 0.5841]
400 [D loss: 1.3539] [G loss: 0.5824]
500 [D loss: 1.3552] [G loss: 0.5813]
600 [D loss: 1.3562] [G loss: 0.5805]
700 [D loss: 1.3569] [G loss: 0.5800]
800 [D loss: 1.3574] [G loss: 0.5795]
900 [D loss: 1.3579] [G loss: 0.5792]
1000 [D loss: 1.3582] [G loss: 0.5789]
1100 [D loss: 1.3585] [G loss: 0.5787]
1200 [D loss: 1.3587] [G loss: 0.5785]
1300 [D loss: 1.3589] [G loss: 0.5783]
1400 [D loss: 1.3591] [G loss: 0.5782]
1500 [D loss: 1.3593] [G loss: 0.5781]
1600 [D loss: 1.3594] [G loss: 0.5780]
1700 [D loss: 1.3595] [G loss: 0.5779]
1800 [D loss: 1.3596] [G loss: 0.5778]
1900 [D loss: 1.3597] [G loss: 0.5777]
2000 [D loss: 1.3598] [G loss: 0.5776]
2100 [D loss: 1.3599] [G loss: 0.5776]
2200 [D loss: 1.3600] [G loss: 0.5775]
2300 [D loss: 1.3600] [G loss: 0.5775]
2400 [D loss: 1.3601] [G loss: 0.5774]
2500 [D loss: 1.3602] [G loss: 0.5774

In [None]:
pip install ctgan



In [None]:
import pandas as pd
from ctgan import CTGAN
# Read the real customer table the model will be fitted on
real_data = pd.read_csv("Customer.csv")

# customer_id is excluded from training; new IDs are generated further down
real_data = real_data.drop(columns=["customer_id"])

# Columns passed to CTGAN as discrete (categorical) columns
categorical_columns = [
    "first_name",
    "last_name",
    "date_of_birth",
    "address",
    "phone_number",
]

# Create a CTGAN model configured for 300 training epochs
ctgan = CTGAN(epochs=300)

# Fit the model on the real rows
ctgan.fit(real_data, discrete_columns=categorical_columns)

# Draw 500 synthetic rows
synthetic_data = ctgan.sample(500)

# Attach a freshly generated random UUID to every synthetic row
import uuid
synthetic_data["customer_id"] = [str(uuid.uuid4()) for _ in synthetic_data.index]

# Put customer_id first; the remaining columns keep their current order
cols = ["customer_id"]
cols.extend(col for col in synthetic_data.columns if col != "customer_id")
synthetic_data = synthetic_data[cols]

print(synthetic_data.head())

synthetic_data.to_csv("synthetic_CTGan_customers.csv", index=False)


                            customer_id first_name last_name date_of_birth  \
0  12736124-0ff1-4c46-8ffc-b8b797afef20     Smythe   Casarez    10/17/1971   
1  42df858e-a5b1-40d7-9c76-67d63ff49661    Killian     Shine     6/17/1971   
2  dcc5122b-22c5-4fea-9972-4acc7d459de8    Salazar  Trinidad    11/10/1971   
3  9b38282f-dfda-4c8d-bbab-882de055e8e6    Varnado    Urbano      6/1/1971   
4  4a7dd2a9-3a61-4107-8f91-fcbc4840e873  Cardinale      Maya    10/17/1971   

      address    phone_number  
0    Sec-1998  (557) 557-7957  
1    Sec-1158  (713) 413-4513  
2      B-1681  (750) 450-5150  
3      D-1244  (620) 620-9220  
4  Block-1418  (656) 256-7456  


multiple files


In [None]:
# Uninstall SDV completely first (clears broken or old install)
!pip uninstall -y sdv
!pip install -U pip setuptools wheel

# Reinstall the correct latest SDV version (v1.0.0+)
!pip install sdv==1.0.1



Found existing installation: sdv 1.22.1
Uninstalling sdv-1.22.1:
  Successfully uninstalled sdv-1.22.1
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-80.9.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.2.0
    Uninstalling setuptools-75.2.0:
      Successfully uninstalled setuptools-75.2.0
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
[31mERROR: pip's dep

[31mERROR: Ignored the following yanked versions: 1.13.0[0m[31m
[0m[31mERROR: Ignored the following versions that require a different python version: 0.10.0 Requires-Python >=3.6,<3.9; 0.10.0.dev0 Requires-Python >=3.6,<3.9; 0.10.1 Requires-Python >=3.6,<3.9; 0.10.1.dev0 Requires-Python >=3.6,<3.9; 0.11.0 Requires-Python >=3.6,<3.9; 0.11.0.dev0 Requires-Python >=3.6,<3.9; 0.12.0 Requires-Python >=3.6,<3.9; 0.12.0.dev0 Requires-Python >=3.6,<3.9; 0.12.0.dev1 Requires-Python >=3.6,<3.9; 0.12.1 Requires-Python >=3.6,<3.9; 0.12.1.dev0 Requires-Python >=3.6,<3.9; 0.13.0 Requires-Python >=3.6,<3.10; 0.13.0.dev0 Requires-Python >=3.6,<3.10; 0.13.1 Requires-Python >=3.6,<3.10; 0.13.1.dev0 Requires-Python >=3.6,<3.10; 0.14.0 Requires-Python >=3.6,<3.10; 0.14.0.dev0 Requires-Python >=3.6,<3.10; 0.14.0.dev1 Requires-Python >=3.6,<3.10; 0.14.0.dev2 Requires-Python >=3.6,<3.10; 0.14.1 Requires-Python >=3.6,<3.10; 0.14.1.dev0 Requires-Python >=3.6,<3.10; 0.15.0 Requires-Python >=3.6,<3.10; 0.15

In [None]:
# Install compatible SDV version
!pip uninstall -y sdv
!pip install sdv==1.0.1


Found existing installation: sdv 1.22.1
Uninstalling sdv-1.22.1:
  Successfully uninstalled sdv-1.22.1
[31mERROR: Ignored the following yanked versions: 1.13.0[0m[31m
[0m[31mERROR: Ignored the following versions that require a different python version: 0.10.0 Requires-Python >=3.6,<3.9; 0.10.0.dev0 Requires-Python >=3.6,<3.9; 0.10.1 Requires-Python >=3.6,<3.9; 0.10.1.dev0 Requires-Python >=3.6,<3.9; 0.11.0 Requires-Python >=3.6,<3.9; 0.11.0.dev0 Requires-Python >=3.6,<3.9; 0.12.0 Requires-Python >=3.6,<3.9; 0.12.0.dev0 Requires-Python >=3.6,<3.9; 0.12.0.dev1 Requires-Python >=3.6,<3.9; 0.12.1 Requires-Python >=3.6,<3.9; 0.12.1.dev0 Requires-Python >=3.6,<3.9; 0.13.0 Requires-Python >=3.6,<3.10; 0.13.0.dev0 Requires-Python >=3.6,<3.10; 0.13.1 Requires-Python >=3.6,<3.10; 0.13.1.dev0 Requires-Python >=3.6,<3.10; 0.14.0 Requires-Python >=3.6,<3.10; 0.14.0.dev0 Requires-Python >=3.6,<3.10; 0.14.0.dev1 Requires-Python >=3.6,<3.10; 0.14.0.dev2 Requires-Python >=3.6,<3.10; 0.14.1 Require

In [None]:
# STEP 1: Import Libraries
import pandas as pd
import numpy as np
import os


# NEW: Import CTGAN-compatible modules from SDV
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata



In [None]:
from google.colab import files

# STEP 2: Prompt for the CSV uploads (all 10: 4 lookup + 6 main)
print("Please upload your 10 CSV files: 4 lookup tables + 6 main tables")
uploaded = files.upload()

# STEP 3: Load every uploaded file into a DataFrame, keyed by the file name
# without its extension
dataframes = dict()
for file_name in uploaded:
    df_name = os.path.splitext(file_name)[0]
    dataframes[df_name] = pd.read_csv(file_name)
print(f"\n✅ Uploaded Tables: {list(dataframes.keys())}")

# STEP 4: Names of the lookup tables and of the main tables
lookup_tables = [
    'PolicyStatus',
    'PolicyType',
    'CoverageType',
    'PaymentMethod',
]
main_tables = [
    'Customer',
    'Policy',
    'Coverage',
    'Premium',
    'Claim',
    'Beneficiary',
]

# # STEP 5: Synthetic Data Generator using CTGAN
# def generate_ctgan(data, table_name, epochs=300):
#     metadata = SingleTableMetadata()
#     metadata.detect_from_dataframe(data=data)

#     model = CTGANSynthesizer(epochs=epochs)
#     model.fit(data)
#     synthetic = model.sample(num_rows=len(data))

#     score = evaluate_quality(data, synthetic, metadata)
#     print(f"Generated: {table_name} | Quality Score: {score['Quality Score']:.2f}")
#     return synthetic


def generate_ctgan(data, table_name, epochs=300):
    """Fit a CTGAN synthesizer on ``data`` and sample a same-sized synthetic table.

    Args:
        data: Table to learn from; its length decides how many rows are sampled.
        table_name: Label used only in the progress message.
        epochs: Number of training epochs handed to ``CTGANSynthesizer``.

    Returns:
        The object returned by the synthesizer's ``sample`` call, containing
        ``len(data)`` synthetic rows.
    """
    # Metadata is auto-detected from the table and handed to the synthesizer
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(data=data)

    synthesizer = CTGANSynthesizer(metadata=table_metadata, epochs=epochs)
    synthesizer.fit(data)

    row_count = len(data)
    synthetic_rows = synthesizer.sample(num_rows=row_count)
    print(f"✅ Generated synthetic data for {table_name}")
    return synthetic_rows



Please upload your 10 CSV files: 4 lookup tables + 6 main tables


Saving Beneficiary.csv to Beneficiary.csv
Saving Claim.csv to Claim.csv
Saving Coverage.csv to Coverage.csv
Saving CoverageType.csv to CoverageType.csv
Saving Customer.csv to Customer.csv
Saving PaymentMethod.csv to PaymentMethod.csv
Saving Policy.csv to Policy.csv
Saving PolicyStatus.csv to PolicyStatus.csv
Saving PolicyType.csv to PolicyType.csv
Saving Premium.csv to Premium.csv

✅ Uploaded Tables: ['Beneficiary', 'Claim', 'Coverage', 'CoverageType', 'Customer', 'PaymentMethod', 'Policy', 'PolicyStatus', 'PolicyType', 'Premium']


In [None]:
# Show the names of the loaded tables (iterating a dict yields its keys)
print(list(dataframes))


['Beneficiary', 'Claim', 'Coverage', 'CoverageType', 'Customer', 'PaymentMethod', 'Policy', 'PolicyStatus', 'PolicyType', 'Premium']


In [None]:
# STEP 6: Generate synthetic Customer table (independent)
synthetic_data = {}
synthetic_data['Customer'] = generate_ctgan(dataframes['Customer'], 'Customer')

# For each dependent table below, the foreign-key columns of the real rows are
# re-drawn at random from the already generated parent table (or from the lookup
# table) before the synthesizer is fitted.
# NOTE(review): the synthesizer may not reproduce these key values exactly, so
# referential integrity of the sampled tables should be verified.

# STEP 7: Generate synthetic Policy table
policy_input = dataframes['Policy'].copy()
policy_input['customer_id'] = np.random.choice(synthetic_data['Customer']['customer_id'], size=len(policy_input))
policy_input['status_id'] = np.random.choice(dataframes['PolicyStatus']['status_id'], size=len(policy_input))
policy_input['type_id'] = np.random.choice(dataframes['PolicyType']['type_id'], size=len(policy_input))
synthetic_data['Policy'] = generate_ctgan(policy_input, 'Policy')

# STEP 8: Generate Coverage table
coverage_input = dataframes['Coverage'].copy()
coverage_input['policy_id'] = np.random.choice(synthetic_data['Policy']['policy_id'], size=len(coverage_input))
# The link to CoverageType is the `coverage_type_id` column; Coverage's own
# `coverage_id` column must not be overwritten with lookup ids.
coverage_input['coverage_type_id'] = np.random.choice(dataframes['CoverageType']['coverage_id'], size=len(coverage_input))
synthetic_data['Coverage'] = generate_ctgan(coverage_input, 'Coverage')

# STEP 9: Generate Premium table
premium_input = dataframes['Premium'].copy()
premium_input['policy_id'] = np.random.choice(synthetic_data['Policy']['policy_id'], size=len(premium_input))
# Premium references PaymentMethod through `payment_method_id`; assigning to
# `method_id` would add a new, unrelated column instead of updating the key.
premium_input['payment_method_id'] = np.random.choice(dataframes['PaymentMethod']['method_id'], size=len(premium_input))
synthetic_data['Premium'] = generate_ctgan(premium_input, 'Premium')



✅ Generated synthetic data for Customer




✅ Generated synthetic data for Policy




✅ Generated synthetic data for Coverage




✅ Generated synthetic data for Premium


In [None]:
from google.colab import files

# Write each synthetic table to "<table>_synthetic.csv" and trigger a browser
# download of the file from Colab
for table_name in synthetic_data:
    df = synthetic_data[table_name]
    filename = f"{table_name}_synthetic.csv"
    df.to_csv(filename, index=False)
    files.download(filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# STEP 0: Install required library
!pip install -q sdv

# STEP 1: Import libraries
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from google.colab import files

# STEP 2: Upload CSVs
uploaded = files.upload()  # Upload all 10 CSV files when prompted

# STEP 3: Read all files into a dictionary
# Tables are keyed by file name without the trailing ".csv". Only a suffix is
# stripped: str.replace('.csv', '') would also delete a ".csv" occurring in the
# middle of a file name.
CSV_SUFFIX = '.csv'
dataframes = {
    (filename[:-len(CSV_SUFFIX)] if filename.endswith(CSV_SUFFIX) else filename): pd.read_csv(filename)
    for filename in uploaded
}

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/180.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.2/180.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.8/73.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.5/193.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

Saving Beneficiary.csv to Beneficiary.csv
Saving Claim.csv to Claim.csv
Saving Coverage.csv to Coverage.csv
Saving CoverageType.csv to CoverageType.csv
Saving Customer.csv to Customer.csv
Saving PaymentMethod.csv to PaymentMethod.csv
Saving Policy.csv to Policy.csv
Saving PolicyStatus.csv to PolicyStatus.csv
Saving PolicyType.csv to PolicyType.csv
Saving Premium.csv to Premium.csv


In [3]:
# List the column names of every loaded table
for name in dataframes:
    df = dataframes[name]
    print(f"\n{name} columns: {df.columns.tolist()}")



Beneficiary columns: ['beneficiary_id', 'policy_id', 'first_name', 'last_name', 'date_of_birth', 'relationship']

Claim columns: ['claim_id', 'policy_id', 'date_filed', 'claim_amount', 'description']

Coverage columns: ['coverage_id', 'policy_id', 'coverage_amount', 'coverage_type_id']

CoverageType columns: ['coverage_id', 'coverage_name']

Customer columns: ['customer_id', 'first_name', 'last_name', 'date_of_birth', 'address', 'phone_number']

PaymentMethod columns: ['method_id', 'method_name']

Policy columns: ['policy_id', 'policy_number', 'start_date', 'end_date', 'status_id', 'type_id', 'customer_id']

PolicyStatus columns: ['status_id', 'status_name']

PolicyType columns: ['type_id', 'type_name']

Premium columns: ['premium_id', 'policy_id', 'amount', 'payment_method_id', 'due_date']


In [8]:


# STEP 4: Number of synthetic rows to generate for each main table
samples = dict(
    Customer=100,
    Policy=200,
    Coverage=200,
    Premium=150,
    Claim=400,
    Beneficiary=300,
)

# STEP 5: Reusable CTGAN function
def generate_ctgan(data, table_name, num_rows=None, epochs=300):
    """Fit a CTGAN synthesizer on ``data`` and sample synthetic rows from it.

    Args:
        data: Table to learn from.
        table_name: Label used only in the progress message.
        num_rows: Number of synthetic rows to sample. When omitted (``None``),
            as many rows as ``data`` contains are sampled.
        epochs: Number of training epochs handed to ``CTGANSynthesizer``.

    Returns:
        The object returned by the synthesizer's ``sample`` call.
    """
    # Metadata is auto-detected from the table and handed to the synthesizer
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data)
    model = CTGANSynthesizer(metadata, epochs=epochs)
    model.fit(data)
    if num_rows is None:
        # Default to a synthetic table of the same size as the input
        num_rows = len(data)
    synthetic = model.sample(num_rows=num_rows)
    print(f"✅ Generated synthetic data for {table_name}")
    return synthetic

# STEP 6: Generate synthetic data
# Every dependent table is copied from the real table with its foreign-key
# columns re-drawn at random from the already generated parent table (or from
# the lookup table); the synthesizer is then fitted on that copy.
synthetic_data = {}

# Customer (independent)
synthetic_data['Customer'] = generate_ctgan(dataframes['Customer'], 'Customer', samples['Customer'])

# Policy
policy_input = dataframes['Policy'].assign(
    customer_id=np.random.choice(synthetic_data['Customer']['customer_id'], size=len(dataframes['Policy'])),
    status_id=np.random.choice(dataframes['PolicyStatus']['status_id'], size=len(dataframes['Policy'])),
    type_id=np.random.choice(dataframes['PolicyType']['type_id'], size=len(dataframes['Policy'])),
)
synthetic_data['Policy'] = generate_ctgan(policy_input, 'Policy', samples['Policy'])

# Coverage
coverage_input = dataframes['Coverage'].assign(
    policy_id=np.random.choice(synthetic_data['Policy']['policy_id'], size=len(dataframes['Coverage'])),
    coverage_type_id=np.random.choice(dataframes['CoverageType']['coverage_id'], size=len(dataframes['Coverage'])),
)
synthetic_data['Coverage'] = generate_ctgan(coverage_input, 'Coverage', samples['Coverage'])

# Premium
premium_input = dataframes['Premium'].assign(
    policy_id=np.random.choice(synthetic_data['Policy']['policy_id'], size=len(dataframes['Premium'])),
    payment_method_id=np.random.choice(dataframes['PaymentMethod']['method_id'], size=len(dataframes['Premium'])),
)
synthetic_data['Premium'] = generate_ctgan(premium_input, 'Premium', samples['Premium'])

# Claim
claim_input = dataframes['Claim'].assign(
    policy_id=np.random.choice(synthetic_data['Policy']['policy_id'], size=len(dataframes['Claim'])),
)
synthetic_data['Claim'] = generate_ctgan(claim_input, 'Claim', samples['Claim'])

# Beneficiary
beneficiary_input = dataframes['Beneficiary'].assign(
    policy_id=np.random.choice(synthetic_data['Policy']['policy_id'], size=len(dataframes['Beneficiary'])),
)
synthetic_data['Beneficiary'] = generate_ctgan(beneficiary_input, 'Beneficiary', samples['Beneficiary'])

# STEP 7: Save each synthetic table as "<table>_synthetic.csv" and trigger a
# browser download of the file from Colab
for table_name in synthetic_data:
    df = synthetic_data[table_name]
    filename = f"{table_name}_synthetic.csv"
    df.to_csv(filename, index=False)
    files.download(filename)




✅ Generated synthetic data for Customer




✅ Generated synthetic data for Policy




✅ Generated synthetic data for Coverage




✅ Generated synthetic data for Premium




✅ Generated synthetic data for Claim




✅ Generated synthetic data for Beneficiary


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import re

def clean_and_format_phone(phone):
    """Normalise a phone value to the ``(XXX) XXX-XXXX`` layout.

    The value is converted to text and every non-digit character is removed.
    If at least ten digits remain, the last ten are formatted; otherwise the
    value is considered invalid.

    Args:
        phone: Any value; it is passed through ``str()`` before cleaning.

    Returns:
        The formatted phone string, or ``None`` when fewer than ten digits
        are present.
    """
    stripped = re.sub(r'\D', '', str(phone))
    # Guard clause: too few digits to form a ten-digit number
    if len(stripped) < 10:
        return None
    # Longer inputs keep only their trailing ten digits
    digits = stripped[-10:]
    return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"

# Normalise the phone numbers of the real Customer table (intended to happen
# before training); the column is overwritten in place
customers_real = dataframes['Customer']
customers_real['phone_number'] = customers_real['phone_number'].apply(clean_and_format_phone)

# Normalise the phone numbers of the generated Customer table in the same way
customers_synthetic = synthetic_data['Customer']
customers_synthetic['phone_number'] = customers_synthetic['phone_number'].apply(clean_and_format_phone)
