# Synteettisen datan generointi

## Tarvittavat paketit

In [2]:
import numpy as np
import sdv
import scipy
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
import synthcity

from ucimlrepo import fetch_ucirepo 



import sdmetrics
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports import utils
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import evaluate_quality

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import display_bayesian_network

from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader




    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    


## Datan valmistelu

Haetaan tässä esimerkkinä UCI Machine Learning Repositoryn Heart Disease -data.
Tämän tilalle voi vaihtaa jonkun muun taulukkomuotoisen datan ja menetelmät toimivat samalla tavalla.

Oman datan saa käyttöön lukemalla sen `real_data_df`-muuttujaan (ks. alla olevan solun kommentoitu `pd.read_excel`-rivi).

In [None]:
  
# Download the UCI Heart Disease dataset (repository id 45)
heart_disease = fetch_ucirepo(id=45)

# Features and targets as exposed by the fetched dataset object
X = heart_disease.data.features
y = heart_disease.data.targets

# Uncomment to inspect the dataset metadata / variable information
# print(heart_disease.metadata)
# print(heart_disease.variables)

# Append the target variable to the feature columns so everything lives in one DataFrame.
# To use your own data instead, load it into real_data_df, e.g.:
# real_data_df = pd.read_excel("omadata.xlsx")
real_data_df = pd.concat(objs=[pd.DataFrame(X), pd.DataFrame(y)], axis="columns")

Tarkistetaan puuttuvat arvot:

In [None]:
# Count of missing values in each column
real_data_df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

Luodaan viisinkertainen ristiinvalidointi:

In [10]:
# Shuffled 5-fold split; the fixed random_state keeps the fold assignment repeatable
kf = KFold(n_splits=5, random_state=19, shuffle=True)

train_folds, test_folds = [], []
for train_index, test_index in kf.split(real_data_df):
    # kf.split yields positional row indices, hence .iloc
    train_folds.append(real_data_df.iloc[train_index])
    test_folds.append(real_data_df.iloc[test_index])

Täydennetään puuttuvat arvot eli imputoidaan. Tässä tapauksessa käytetään k-nearest-neighbor -imputointia:

In [12]:
col_names = real_data_df.columns

for i in range(kf.get_n_splits()):
    train_has_nan = train_folds[i].isna().to_numpy().any()
    test_has_nan = test_folds[i].isna().to_numpy().any()

    # Nothing to impute in this fold: leave both frames untouched
    if not (train_has_nan or test_has_nan):
        continue

    # Always fit a fresh imputer on THIS fold's training data. Previously the
    # imputer was only created when the training fold had missing values, so a
    # fold whose NaNs were all in the test split either raised NameError (first
    # fold) or silently reused an imputer fitted on another fold's training data.
    imputer = KNNImputer(n_neighbors=5).fit(train_folds[i])

    # Note: rebuilding the frame from the imputed array resets the row index to 0..n-1
    if train_has_nan:
        train_folds[i] = pd.DataFrame(imputer.transform(train_folds[i]), columns=col_names)
    if test_has_nan:
        test_folds[i] = pd.DataFrame(imputer.transform(test_folds[i]), columns=col_names)
    

## Generointi

Eri paketit vaativat hieman eri syntaksin.

Asetuksia muokkaamalla voi saada parempia/huonompia tuloksia, mutta vakioasetukset tuottavat hyvän peruskäsityksen mallien toimintakyvystä.

In [8]:
def generate_TVAE_data(train_data, n_synth):
    """Fit an SDV TVAE synthesizer on ``train_data`` and sample synthetic rows.

    Args:
        train_data: pandas DataFrame holding the real training rows. An ``ID``
            column, if present, is excluded from modelling. The caller's
            DataFrame is left unmodified.
        n_synth: Number of synthetic rows to sample.

    Returns:
        The synthetic data returned by ``TVAESynthesizer.sample(n_synth)``.
    """
    # Work on a new frame instead of drop(..., inplace=True), which altered the
    # caller's DataFrame as a side effect.
    train_data = train_data.drop(columns='ID', errors='ignore')

    # Column types are auto-detected from the training frame
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(train_data)

    synthesizer = TVAESynthesizer(metadata,
                                  compress_dims=(256, 128),
                                  decompress_dims=(128, 256),
                                  embedding_dim=64,
                                  enforce_min_max_values=False,
                                  epochs=3000)
    # synthesizer.add_constraints(constraints = constraint_list)

    synthesizer.fit(train_data)
    synthetic_data = synthesizer.sample(n_synth)

    return synthetic_data


In [9]:
def generate_CTGAN_data(train_data, n_synth):
    """Fit an SDV CTGAN synthesizer on ``train_data`` and sample synthetic rows.

    Args:
        train_data: pandas DataFrame holding the real training rows. An ``ID``
            column, if present, is excluded from modelling. The caller's
            DataFrame is left unmodified.
        n_synth: Number of synthetic rows to sample.

    Returns:
        The synthetic data returned by ``CTGANSynthesizer.sample(n_synth)``.
    """
    # Work on a new frame instead of drop(..., inplace=True), which altered the
    # caller's DataFrame as a side effect.
    train_data = train_data.drop(columns='ID', errors='ignore')

    # Column types are auto-detected from the training frame
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(train_data)

    synthesizer = CTGANSynthesizer(metadata,
                                   enforce_min_max_values=False,
                                   epochs=3000)

    synthesizer.fit(train_data)
    synthetic_data = synthesizer.sample(n_synth)

    return synthetic_data

In [10]:
def generate_gaussiancopula_data(train_data, n_synth):
    """Fit an SDV Gaussian copula synthesizer on ``train_data`` and sample synthetic rows.

    Args:
        train_data: pandas DataFrame holding the real training rows. An ``ID``
            column, if present, is excluded from modelling. The caller's
            DataFrame is left unmodified.
        n_synth: Number of synthetic rows to sample.

    Returns:
        The synthetic data returned by ``GaussianCopulaSynthesizer.sample(n_synth)``.
    """
    # Work on a new frame instead of drop(..., inplace=True), which altered the
    # caller's DataFrame as a side effect.
    train_data = train_data.drop(columns='ID', errors='ignore')

    # Column types are auto-detected from the training frame
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(train_data)

    synthesizer = GaussianCopulaSynthesizer(metadata,
                                            enforce_min_max_values=False)

    synthesizer.fit(train_data)
    synthetic_data = synthesizer.sample(n_synth)

    return synthetic_data

In [11]:
# One TVAE model per cross-validation fold, 1000 synthetic rows sampled from each
synth_data_sdv_TVAE_list = [
    generate_TVAE_data(train_folds[fold_idx], n_synth=1000)
    for fold_idx in range(kf.get_n_splits())
]

In [12]:
# One CTGAN model per cross-validation fold, 1000 synthetic rows sampled from each
synth_data_sdv_CTGAN_list = []

for fold_idx in range(kf.get_n_splits()):
    # The per-fold result keeps its notebook-level name
    synth_data_sdv_CTGAN_i = generate_CTGAN_data(train_data=train_folds[fold_idx], n_synth=1000)
    synth_data_sdv_CTGAN_list += [synth_data_sdv_CTGAN_i]

In [13]:
# One Gaussian copula model per cross-validation fold, 1000 synthetic rows sampled from each
synth_data_sdv_gcopula_list = []

for i in range(kf.get_n_splits()):
    synth_data_sdv_gcopula_i = generate_gaussiancopula_data(train_folds[i], n_synth=1000)
    # Store this fold's Gaussian copula sample. The previous code appended
    # synth_data_sdv_CTGAN_i here, filling the list with the last CTGAN sample.
    synth_data_sdv_gcopula_list.append(synth_data_sdv_gcopula_i)

In [None]:
# Normalizing-flow ("nflow") synthcity plugin: fit per fold, generate 1000 rows each
synth_data_synthcity_nflow_list = []

for fold_idx in range(kf.get_n_splits()):
    # A new plugin instance is requested for every fold
    nflow_plugin = Plugins().get("nflow")
    nflow_plugin.fit(train_folds[fold_idx], verbose=False)
    nflow_sample = nflow_plugin.generate(count=1000, verbose=False)
    # Store the generated data as a plain DataFrame
    synth_data_synthcity_nflow_list.append(nflow_sample.dataframe())

[2025-08-29T15:45:41.715745+0300][4358][CRITICAL] module disabled: /home/joonas/wdl/wdl_workfolder/.venv/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
[2025-08-29T15:45:41.717682+0300][4358][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-29T15:45:41.719060+0300][4358][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-29T15:45:41.720093+0300][4358][CRITICAL] module plugin_great load failed
 65%|██████▍   | 649/1000 [00:36<00:19, 18.01it/s]
[2025-08-29T15:46:19.710207+0300][4358][CRITICAL] module disabled: /home/joonas/wdl/wdl_workfolder/.venv/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
[2025-08-29T15:46:19.711449+0300][4358][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-29T15:46:19.712214+0300][4358][CRITICAL] load failed: module 'synthcity.plugins.generic.plugi

In [None]:
# Diffusion ("ddpm") synthcity plugin: fit per fold, generate 1000 rows each
synth_data_synthcity_diffusion_list = []

for fold_idx in range(kf.get_n_splits()):
    # A new plugin instance is requested for every fold
    ddpm_plugin = Plugins().get("ddpm")
    ddpm_plugin.fit(train_folds[fold_idx])
    ddpm_sample = ddpm_plugin.generate(count=1000)
    # Store the generated data as a plain DataFrame
    synth_data_synthcity_diffusion_list.append(ddpm_sample.dataframe())

[2025-08-06T14:23:22.795394+0300][2198][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-06T14:23:22.796418+0300][2198][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-06T14:23:22.797065+0300][2198][CRITICAL] module plugin_great load failed
[2025-08-06T14:23:22.797684+0300][2198][CRITICAL] module disabled: /home/pekkamela/wdl/wdl_workfolder/.venv/lib/python3.10/site-packages/synthcity/plugins/generic/plugin_goggle.py
Epoch: 100%|██████████| 1000/1000 [00:18<00:00, 54.64it/s, loss=1.17]
[2025-08-06T14:24:04.714464+0300][2198][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-06T14:24:04.715069+0300][2198][CRITICAL] load failed: module 'synthcity.plugins.generic.plugin_great' has no attribute 'plugin'
[2025-08-06T14:24:04.715446+0300][2198][CRITICAL] module plugin_great load failed
[2025-08-06T14:24:04.715929+0300][2198

In [22]:
# Columns that DataSynthesizer is told to treat as categorical
categorical_attributes = {
    'sex': True,
    'cp': True,
    'fbs': True,
    'restecg': True,
    'exang': True,
    'slope': True,
    'num': True,
}

# Privacy settings passed to the describer
epsilon = 1
degree_of_bayesian_network = 2

synth_data_privbayes_list = []

for i in range(kf.get_n_splits()):
    # DataSynthesizer works from files, so the training fold is written to CSV first
    train_folds[i].to_csv(f'datasynthesizer_files/train_fold_{i}.csv')

    # Describe the fold in correlated-attribute mode (builds the Bayesian network)
    describer = DataDescriber(category_threshold=5)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=f'./datasynthesizer_files/train_fold_{i}.csv',
        epsilon=epsilon,
        k=degree_of_bayesian_network,
        attribute_to_is_categorical=categorical_attributes,
    )
    describer.save_dataset_description_to_file(f'./datasynthesizer_files/description_{i}.json')

    # Generate 1000 synthetic rows from the saved description and write them to CSV
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(1000, f'./datasynthesizer_files/description_{i}.json')
    generator.save_synthetic_data(f'./data/bayes/synteettinen_bayes_data_{i}.csv')

Adding ROOT num


Adding attribute thalach
Adding attribute oldpeak
Adding attribute restecg
Adding attribute fbs
Adding attribute trestbps
Adding attribute slope
Adding attribute chol
Adding attribute thal
Adding attribute ca
Adding attribute sex
Adding attribute exang
Adding attribute age
Adding attribute cp
Adding ROOT num
Adding attribute thalach
Adding attribute oldpeak
Adding attribute restecg
Adding attribute fbs
Adding attribute trestbps
Adding attribute slope
Adding attribute cp
Adding attribute thal
Adding attribute ca
Adding attribute sex
Adding attribute exang
Adding attribute chol
Adding attribute age
Adding ROOT num
Adding attribute thalach
Adding attribute oldpeak
Adding attribute restecg
Adding attribute fbs
Adding attribute trestbps
Adding attribute exang
Adding attribute chol
Adding attribute thal
Adding attribute ca
Adding attribute sex
Adding attribute slope
Adding attribute age
Adding attribute cp
Adding ROOT num
Adding attribute thalach
Adding attribute oldpeak
Adding attribute res

In [23]:
# Load the PrivBayes synthetic data of every fold back into memory
synth_data_privbayes_list = []

for i in range(kf.get_n_splits()):
    # Read from the location the generation cell writes with save_synthetic_data
    # ('./data/bayes/synteettinen_bayes_data_{i}.csv'). The previous path,
    # './datasynthesizer_files/synth_data_{i}.csv', is never written by this notebook.
    synth_data_privbayes_i = pd.read_csv(f'./data/bayes/synteettinen_bayes_data_{i}.csv', index_col=0)
    synth_data_privbayes_list.append(synth_data_privbayes_i)

Tallennetaan lopuksi syntynyt data:

In [24]:
# Write every synthetic data set to ./data/<method>/, one CSV per fold

# SDV models
for fold_idx, tvae_frame in enumerate(synth_data_sdv_TVAE_list):
    tvae_frame.to_csv(f"./data/tvae/synteettinen_tvae_data_{fold_idx}.csv")

for fold_idx, ctgan_frame in enumerate(synth_data_sdv_CTGAN_list):
    ctgan_frame.to_csv(f"./data/ctgan/synteettinen_ctgan_data_{fold_idx}.csv")

for fold_idx, gcopula_frame in enumerate(synth_data_sdv_gcopula_list):
    gcopula_frame.to_csv(f"./data/gcopula/synteettinen_gcopula_data_{fold_idx}.csv")

# synthcity models
for fold_idx, nflow_frame in enumerate(synth_data_synthcity_nflow_list):
    nflow_frame.to_csv(f"./data/nflow/synteettinen_nflow_data_{fold_idx}.csv")

for fold_idx, diffusion_frame in enumerate(synth_data_synthcity_diffusion_list):
    diffusion_frame.to_csv(f"./data/diffusion/synteettinen_diffusion_data_{fold_idx}.csv")

# The real train/test folds are stored alongside the synthetic data
for fold_idx, train_frame in enumerate(train_folds):
    train_frame.to_csv(f"./data/train/training_data_{fold_idx}.csv")

for fold_idx, test_frame in enumerate(test_folds):
    test_frame.to_csv(f"./data/test/testing_data_{fold_idx}.csv")