# 04 - Synthetic Data Generation

This notebook demonstrates:
- Preprocessing tabular features for generative modeling
- Training a CTGAN model
- Generating synthetic samples
- Exporting the synthetic dataset (columns aligned with the original data) to CSV

In [None]:
# Load dataset
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the raw table; the path is resolved relative to the current working
# directory.
df = pd.read_csv('df.csv')

# Candidate feature columns, split by how this cell treats them: numerical
# columns are min-max scaled below, categorical columns are left untouched.
numerical_columns = ['AGE', 'avg_DIAMETRO', 'n_dislocazioni', 'avg_x', 'avg_y']
categorical_columns = ['TIPO', 'materiale_1', 'materiale_2', 'materiale_3', 'materiale_4', 'MONTH', 'YEAR']

# Keep only the candidates that are actually present in the loaded file, so a
# missing column does not raise a KeyError when the lists are used for indexing.
numerical_columns = [col for col in numerical_columns if col in df.columns]
categorical_columns = [col for col in categorical_columns if col in df.columns]

# Normalize numerical features in place (MinMaxScaler's default range is [0, 1]).
# The fitted scaler is kept in `scaler` rather than discarded, so scaled values
# can later be mapped back to their original units with
# `scaler.inverse_transform(...)`. It stays None when there is nothing to scale.
scaler = None
if numerical_columns:
    scaler = MinMaxScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [None]:
# Train CTGAN
from ctgan import CTGAN

# Hyperparameters are gathered in one mapping and unpacked into the
# constructor, which keeps the configuration easy to inspect or log.
ctgan_settings = {
    'embedding_dim': 128,
    'generator_dim': (256, 256),
    'discriminator_dim': (256, 256),
    'generator_lr': 2e-4,
    'discriminator_lr': 2e-4,
    'batch_size': 500,
    'epochs': 100,
    'verbose': True,
}
ctgan = CTGAN(**ctgan_settings)

# Train on the numerical features followed by the categorical ones; the
# categorical names are passed to `fit` as the discrete columns.
columns_for_training = [*numerical_columns, *categorical_columns]
training_frame = df[columns_for_training]
ctgan.fit(training_frame, discrete_columns=categorical_columns)

In [None]:
# Draw synthetic rows from the trained model
n_samples = 1200
synthetic_data = ctgan.sample(n_samples)

# Give the synthetic frame the same columns, in the same order, as the
# original frame. Columns the model did not produce are filled with the
# constant 0.
missing_columns = [name for name in df.columns if name not in synthetic_data.columns]
for name in missing_columns:
    synthetic_data[name] = 0
synthetic_data = synthetic_data.loc[:, df.columns]

In [None]:
# Write the synthetic frame to disk without the row index
output_path = 'augmented_gas_dispersion_data_ctgan.csv'
synthetic_data.to_csv(output_path, index=False)
print('Synthetic dataset saved.')