In [None]:
# Install required packages in Google Colab
!pip install ctgan gradio


Collecting ctgan
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting rdt>=1.14.0 (from ctgan)
  Downloading rdt-1.17.0-py3-none-any.whl.metadata (10 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 k

In [None]:
import pandas as pd
import tempfile
import math
from ctgan import CTGAN
import gradio as gr

In [None]:
def _split_evenly(total, parts):
    """Split ``total`` into ``parts`` non-negative integers that sum to ``total``."""
    base, extra = divmod(total, parts)
    # The first ``extra`` parts absorb the remainder, one unit each.
    return [base + 1 if i < extra else base for i in range(parts)]


def generate_synthetic_data(file, epochs, num_samples, *, chunk_size=1000, min_chunk_rows=10):
    """Train CTGAN on an uploaded CSV in row chunks and write synthetic rows to a CSV.

    The input is split into consecutive chunks of ``chunk_size`` rows. A separate
    CTGAN model is trained on every chunk that has at least ``min_chunk_rows`` rows,
    and the requested number of samples is divided across those trained chunks so
    that the output contains exactly ``num_samples`` rows.

    Args:
        file: Path to the CSV file, or an object whose ``name`` attribute is that path.
        epochs: Number of training epochs per chunk; converted with ``int()``.
        num_samples: Total number of synthetic rows to produce; converted with ``int()``.
        chunk_size: Maximum number of rows per training chunk.
        min_chunk_rows: Chunks with fewer rows than this are not used for training.

    Returns:
        str: Path to a temporary ``.csv`` file containing the synthetic data. The file
        is created with ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If no file is given, if ``epochs``/``num_samples`` are missing or
            less than 1, if the chunk settings are less than 1, or if no chunk has at
            least ``min_chunk_rows`` rows.
    """
    if file is None:
        raise ValueError("Please upload a CSV file before generating data.")
    if epochs is None or num_samples is None:
        raise ValueError("Epochs and number of samples are both required.")

    n_epochs = int(epochs)
    n_samples = int(num_samples)
    if n_epochs < 1:
        raise ValueError("Epochs must be at least 1.")
    if n_samples < 1:
        raise ValueError("Number of samples must be at least 1.")
    if chunk_size < 1 or min_chunk_rows < 1:
        raise ValueError("chunk_size and min_chunk_rows must be at least 1.")

    # Accept both a plain path string and an upload object exposing `.name`.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)

    # Row slicing keeps column dtypes, so the discrete (categorical) columns
    # only need to be detected once for the whole frame.
    discrete_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    chunks = [
        df.iloc[start:start + chunk_size].reset_index(drop=True)
        for start in range(0, len(df), chunk_size)
    ]
    # Very small chunks are skipped rather than trained on.
    trainable = [chunk for chunk in chunks if len(chunk) >= min_chunk_rows]
    if not trainable:
        raise ValueError(
            f"The dataset needs at least {min_chunk_rows} rows to train CTGAN "
            f"(found {len(df)} rows)."
        )

    generated_data = []
    # Quotas are computed over the chunks that are actually trained, so skipped
    # chunks and integer truncation no longer shrink the output.
    for chunk, quota in zip(trainable, _split_evenly(n_samples, len(trainable))):
        if quota == 0:
            # Fewer samples requested than chunks: don't train a model nobody samples from.
            continue

        # Epochs go to the constructor; passing them to fit() is deprecated in ctgan.
        model = CTGAN(epochs=n_epochs)
        model.fit(chunk, discrete_columns)
        generated_data.append(model.sample(quota))

    final_df = pd.concat(generated_data, ignore_index=True)

    # Write through the handle and close it on exit so no file descriptor is leaked;
    # delete=False keeps the file on disk for the caller to serve.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as out_file:
        final_df.to_csv(out_file, index=False)
        temp_path = out_file.name

    return temp_path

# Web UI around generate_synthetic_data: one CSV upload and two numeric settings in,
# one downloadable CSV out.
iface = gr.Interface(
    fn=generate_synthetic_data,
    inputs=[
        # Only offer CSV files in the upload dialog.
        gr.File(label="Upload CSV", file_types=[".csv"]),
        # precision=0 makes the value an integer; minimum=1 blocks zero/negative input.
        gr.Number(label="Epochs", value=5, precision=0, minimum=1),
        gr.Number(label="Number of Samples", value=100, precision=0, minimum=1),
    ],
    outputs=gr.File(label="Download Synthetic CSV"),
    title="CTGAN Synthesizer",
    description="Upload a dataset, train CTGAN in memory-safe batches, and download synthetic data."
)

# share=True serves the app on a public URL; show_error=True reports exceptions raised
# by the function in the app itself. In Colab, Gradio additionally asks for debug=True
# in launch() to show errors in the notebook output.
iface.launch(share=True, show_error=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7014e1db5ab8050ab7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


