In [1]:
from google.colab import files

# Colab-only step: open the browser upload dialog for the locally downloaded
# 'breast_cancer.csv'. `uploaded` keeps whatever files.upload() returns.
uploaded = files.upload()

Saving breast_cancer.csv to breast_cancer.csv


In [2]:
!pip install sdv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import numpy as np
import time
from matplotlib import pyplot as plt

from sdv.tabular import TVAE
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate

import seaborn as sns
from sdv.metrics.tabular import NumericalRadiusNearestNeighbor as NRNN

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [14]:
# Hyperparameter grid: one CTGAN and one TVAE model is created for every
# (batch size, epochs) combination by the grid-building cell.
batch_sizes = [50, 100]
epochs = [100, 500, 5000]

# Model instances, appended pairwise in the grid-building cell, so position k
# in both lists refers to the same (batch size, epochs) combination.
gan_models = []
vae_models = []
#copula_models = []

# Model-name prefixes used when composing result keys (see get_key).
MODEL_TVAE = "TVAE"
MODEL_CTGAN = "CTGAN"

In [15]:
# Metric names handed to sdv.evaluation.evaluate via evaluate_model(metrics=...)

#ml_metrics = ['MulticlassEfficacyMetric', "MulticlassDecisionTreeClassifier", "MulticlassMLPClassifier"]

# Statistical similarity metrics between the real and the synthetic table.
statistical_metrics = ['CSTest', 'KSComplement', 'ContinuousKLDivergence', 'DiscreteKLDivergence']
# NOTE(review): in the recorded runs this metric produces no score and reports
# "`key_fields` must be passed either directly or..." - confirm how key and
# sensitive fields are meant to be supplied before relying on it.
privacy_metrics = ["NumericalRadiusNearestNeighbor"]

all_metrics = statistical_metrics + privacy_metrics

# Result key -> synthetic DataFrame (an empty frame until the model was sampled).
synthetic_data_mapping = {}
# Result keys in creation order; iterated by the training and the report cells.
key_list = []

In [6]:
def get_key(batch, epoch, model):
    """Compose the label identifying one (model, batch size, epochs) run.

    ``model`` must be a string; ``batch`` and ``epoch`` are converted with
    ``str``. Example: ``get_key(50, 100, "TVAE")`` gives
    ``"TVAE: Batch size - 50, Epochs - 100"``.
    """
    pieces = (model, ': Batch size - ', str(batch), ", Epochs - ", str(epoch))
    return "".join(pieces)

In [16]:
# Build one CTGAN and one TVAE per (batch size, epochs) combination.
#
# The containers live in other cells, so they are emptied here first: without
# this, re-running the cell would append a second set of models and duplicate
# keys on top of the ones left over from the previous run.
gan_models.clear()
vae_models.clear()
key_list.clear()

for batch in batch_sizes:
    for epoch in epochs:
        tvae_key = get_key(batch=batch, epoch=epoch, model=MODEL_TVAE)
        ctgan_key = get_key(batch=batch, epoch=epoch, model=MODEL_CTGAN)

        gan_models.append(CTGAN(batch_size=batch, verbose=True, epochs=epoch, primary_key="id"))
        vae_models.append(TVAE(batch_size=batch, epochs=epoch, primary_key="id"))

        # TVAE key first, CTGAN key second: later cells rely on this order.
        key_list.append(tvae_key)
        key_list.append(ctgan_key)

        # Empty placeholder until the model has been trained and sampled.
        synthetic_data_mapping[tvae_key] = pd.DataFrame()
        synthetic_data_mapping[ctgan_key] = pd.DataFrame()

In [8]:
# Load the uploaded dataset from the working directory.
df = pd.read_csv("breast_cancer.csv")
# Tag the frame with a dataset name; in the visible cells it is only read by
# commented-out code.
df.name = "breast_cancer"
# Column overview: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [9]:
# Remove the "Unnamed: 32" column that read_csv produced for this file.
# errors="ignore" keeps the cell re-runnable: once the column is gone, a second
# execution would otherwise raise KeyError. The drop stays in-place so `df`
# remains the same object that was loaded above.
df.drop(columns=["Unnamed: 32"], inplace=True, errors="ignore")

In [10]:
# Re-check the schema after the drop (the recorded output lists 32 columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [17]:
def evaluate_model(model, key, metrics, df):
    """Fit ``model`` on ``df``, sample a synthetic table and score it.

    Parameters
    ----------
    model : object
        Unfitted synthesizer exposing ``fit(df)`` and ``sample(n)``.
    key : str
        Label of this run; used in the printed progress messages and as the
        key under which the synthetic table is stored.
    metrics : list
        Metric names forwarded to ``evaluate``.
    df : pandas.DataFrame
        Real table used both for training and as the evaluation reference.

    Returns
    -------
    The object returned by ``evaluate(..., aggregate=False)``, so callers can
    collect the scores instead of only seeing them displayed.

    Side effects
    ------------
    Stores the sampled table in the notebook-level ``synthetic_data_mapping``
    under ``key`` and prints/displays progress, timings and scores.
    """
    print('Training in Progress - ' + key + '\n')

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by wall-clock adjustments during long training runs.
    start = time.perf_counter()
    model.fit(df)
    training_seconds = time.perf_counter() - start

    print( '\n' + model.__class__.__name__ + ' trained. \nTraining time: ' + str(training_seconds) + ' seconds \n')

    # Sample as many synthetic rows as there are real rows.
    syn_data = model.sample(len(df))
    syn_data.name = key

    # Save the generated synthetic data for this run in the shared dictionary.
    synthetic_data_mapping[key] = syn_data

    start = time.perf_counter()
    scores = evaluate(syn_data, df, metrics=metrics , aggregate=False)
    evaluation_seconds = time.perf_counter() - start

    print("Synthetic Data Evaluation - " + key + '\n')
    display(scores)
    print('\nEvaluation time: ' + str(evaluation_seconds) + ' seconds \n')

    return scores

In [12]:
# Inspect the generated result keys (TVAE and CTGAN alternate per grid point).
key_list

['TVAE: Batch size - 50, Epochs - 100',
 'CTGAN: Batch size - 50, Epochs - 100',
 'TVAE: Batch size - 50, Epochs - 500',
 'CTGAN: Batch size - 50, Epochs - 500',
 'TVAE: Batch size - 50, Epochs - 5000',
 'CTGAN: Batch size - 50, Epochs - 5000',
 'TVAE: Batch size - 100, Epochs - 100',
 'CTGAN: Batch size - 100, Epochs - 100',
 'TVAE: Batch size - 100, Epochs - 500',
 'CTGAN: Batch size - 100, Epochs - 500',
 'TVAE: Batch size - 100, Epochs - 5000',
 'CTGAN: Batch size - 100, Epochs - 5000']

In [18]:
# Train and evaluate every configured model, walking the result keys in order.
# `i` tracks the next unused TVAE model and `j` the next unused CTGAN model.
i, j = 0, 0
for key in key_list:
    if MODEL_TVAE in key:
        evaluate_model(model=vae_models[i], key=key, metrics=all_metrics, df=df)
        i += 1
    else:
        evaluate_model(model=gan_models[j], key=key, metrics=all_metrics, df=df)
        j += 1


Training in Progress - TVAE: Batch size - 50, Epochs - 100


TVAE trained. 
Training time: 28.277936697006226 seconds 

Synthetic Data Evaluation - TVAE: Batch size - 50, Epochs - 100



Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.953622,0.953622,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.844492,0.844492,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.697478,0.697478,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.9484953880310059 seconds 

Training in Progress - CTGAN: Batch size - 50, Epochs - 100

Epoch 1, Loss G:  0.7814,Loss D: -0.2348
Epoch 2, Loss G:  0.6251,Loss D: -0.5563
Epoch 3, Loss G:  0.2411,Loss D: -0.8390
Epoch 4, Loss G: -0.9842,Loss D:  0.3416
Epoch 5, Loss G: -1.3123,Loss D:  0.2081
Epoch 6, Loss G: -1.0697,Loss D: -0.2150
Epoch 7, Loss G: -0.6817,Loss D:  0.5607
Epoch 8, Loss G: -0.2890,Loss D:  0.2014
Epoch 9, Loss G:  0.1790,Loss D: -0.6294
Epoch 10, Loss G: -0.3485,Loss D:  0.2102
Epoch 11, Loss G: -0.4807,Loss D:  0.3016
Epoch 12, Loss G: -0.5189,Loss D: -0.3353
Epoch 13, Loss G: -1.1379,Loss D:  0.3831
Epoch 14, Loss G: -0.6763,Loss D: -0.0651
Epoch 15, Loss G: -0.4308,Loss D: -0.2785
Epoch 16, Loss G: -1.7904,Loss D:  0.3022
Epoch 17, Loss G: -1.2867,Loss D: -0.1752
Epoch 18, Loss G: -2.2805,Loss D:  0.1641
Epoch 19, Loss G: -1.8993,Loss D: -0.5193
Epoch 20, Loss G: -1.5983,Loss D:  0.0901
Epoch 21, Loss G: -2.1480,Loss D: -0.3542
Epoch 22, Loss G: -

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.991299,0.991299,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.671183,0.671183,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.364236,0.364236,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.6518609523773193 seconds 

Training in Progress - TVAE: Batch size - 50, Epochs - 500


TVAE trained. 
Training time: 96.68359088897705 seconds 

Synthetic Data Evaluation - TVAE: Batch size - 50, Epochs - 500



Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.953622,0.953622,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.887239,0.887239,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.694817,0.694817,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.627356767654419 seconds 

Training in Progress - CTGAN: Batch size - 50, Epochs - 500

Epoch 1, Loss G:  0.7564,Loss D: -0.1861
Epoch 2, Loss G:  0.5155,Loss D: -0.4957
Epoch 3, Loss G:  0.1061,Loss D: -0.0219
Epoch 4, Loss G: -0.3608,Loss D: -0.7455
Epoch 5, Loss G: -1.4451,Loss D:  0.5661
Epoch 6, Loss G: -1.1834,Loss D: -0.0679
Epoch 7, Loss G: -1.1850,Loss D:  0.4097
Epoch 8, Loss G: -0.5748,Loss D:  0.0614
Epoch 9, Loss G: -0.3316,Loss D:  0.0140
Epoch 10, Loss G: -0.5900,Loss D: -0.2824
Epoch 11, Loss G: -0.6981,Loss D:  0.8092
Epoch 12, Loss G: -0.3729,Loss D: -0.0183
Epoch 13, Loss G: -0.6802,Loss D:  0.1136
Epoch 14, Loss G: -0.7402,Loss D:  0.5056
Epoch 15, Loss G: -0.9770,Loss D: -0.2397
Epoch 16, Loss G: -1.0355,Loss D:  0.5879
Epoch 17, Loss G: -1.4252,Loss D:  0.6124
Epoch 18, Loss G: -0.7072,Loss D:  0.4588
Epoch 19, Loss G: -1.6245,Loss D:  1.0601
Epoch 20, Loss G: -1.4793,Loss D: -0.2908
Epoch 21, Loss G: -1.2559,Loss D:  0.0161
Epoch 22, Loss G: -1

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.947832,0.947832,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.706729,0.706729,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.389121,0.389121,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.6930127143859863 seconds 

Training in Progress - TVAE: Batch size - 50, Epochs - 5000


TVAE trained. 
Training time: 1362.9436593055725 seconds 

Synthetic Data Evaluation - TVAE: Batch size - 50, Epochs - 5000



Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.944939,0.944939,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.881966,0.881966,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.700066,0.700066,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 7, Loss G: -0.8819,Loss D: -0.2845
Epoch 8, Loss G: -0.8602,Loss D:  0.4319
Epoch 9, Loss G: -0.4500,Loss D:  0.2786
Epoch 10, Loss G: -1.2552,Loss D:  0.0535
Epoch 11, Loss G: -0.9817,Loss D:  0.1615
Epoch 12, Loss G: -1.0734,Loss D:  0.7600
Epoch 13, Loss G: -0.5409,Loss D: -0.2245
Epoch 14, Loss G: -1.1017,Loss D: -0.3848
Epoch 15, Loss G: -1.6720,Loss D: -0.1874
Epoch 16, Loss G: -1.0440,Loss D: -0.0234
Epoch 17, Loss G: -1.5969,Loss D:  0.1245
Epoch 18, Loss G: -1.4697,Loss D:  0.3508
Epoch 19, Loss G: -0.7352,Loss D: -0.5627
Epoch 20, Loss G: -1.1872,Loss D: -0.1728
Epoch 21, Loss G: -0.5874,Loss D: -0.1063
Epoch 22, Loss G: -1.3531,Loss D: -0.6282
Epoch 23, Loss G: -1.6203,Loss D:  0.4525
Epoch 24, Loss G: -1.3952,Loss D:  0.6139
Epoch 25, Loss G: -1.6584,Loss D: -0.1867
Epoch 26, Loss G: -1.3233,Loss D: -0.1916
Epoch 27, Loss G: -1.1524,Loss D: -0.2672
Epoch 28, Loss G: -2.1399,Loss D: -0.2007
Epoch 29, Loss

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.9942,0.9942,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.863314,0.863314,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.559741,0.559741,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.616140365600586 seconds 

Training in Progress - TVAE: Batch size - 100, Epochs - 100


TVAE trained. 
Training time: 19.880390167236328 seconds 

Synthetic Data Evaluation - TVAE: Batch size - 100, Epochs - 100



Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.852929,0.852929,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.821475,0.821475,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.710529,0.710529,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.6096055507659912 seconds 

Training in Progress - CTGAN: Batch size - 100, Epochs - 100

Epoch 1, Loss G:  0.8033,Loss D: -0.0296
Epoch 2, Loss G:  0.8704,Loss D: -0.2366
Epoch 3, Loss G:  0.6474,Loss D: -0.3785
Epoch 4, Loss G:  0.3866,Loss D: -0.7880
Epoch 5, Loss G: -0.0890,Loss D: -0.7587
Epoch 6, Loss G: -0.3084,Loss D: -1.1066
Epoch 7, Loss G: -0.8556,Loss D: -0.1192
Epoch 8, Loss G: -1.3341,Loss D:  0.0064
Epoch 9, Loss G: -0.8206,Loss D: -0.0915
Epoch 10, Loss G: -1.4127,Loss D:  0.0988
Epoch 11, Loss G: -1.1018,Loss D:  0.1059
Epoch 12, Loss G: -1.2446,Loss D:  0.1889
Epoch 13, Loss G: -1.0819,Loss D:  0.4851
Epoch 14, Loss G: -1.2637,Loss D:  0.2736
Epoch 15, Loss G: -1.4344,Loss D:  0.1661
Epoch 16, Loss G: -1.2406,Loss D:  0.5475
Epoch 17, Loss G: -0.9439,Loss D:  0.1451
Epoch 18, Loss G: -0.5449,Loss D: -0.4177
Epoch 19, Loss G: -0.4239,Loss D: -0.4709
Epoch 20, Loss G: -0.8103,Loss D: -0.0531
Epoch 21, Loss G: -0.8129,Loss D: -0.0294
Epoch 22, Loss G: 

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.976801,0.976801,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.666194,0.666194,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.36504,0.36504,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.609790563583374 seconds 

Training in Progress - TVAE: Batch size - 100, Epochs - 500


TVAE trained. 
Training time: 61.73712205886841 seconds 

Synthetic Data Evaluation - TVAE: Batch size - 100, Epochs - 500



Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.9797,0.9797,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.894042,0.894042,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.706662,0.706662,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.6318624019622803 seconds 

Training in Progress - CTGAN: Batch size - 100, Epochs - 500

Epoch 1, Loss G:  0.7487,Loss D: -0.1209
Epoch 2, Loss G:  0.5815,Loss D: -0.2610
Epoch 3, Loss G:  0.4719,Loss D: -0.5064
Epoch 4, Loss G:  0.1047,Loss D: -0.6294
Epoch 5, Loss G: -0.4712,Loss D: -0.4352
Epoch 6, Loss G: -0.8569,Loss D: -0.2871
Epoch 7, Loss G: -0.7848,Loss D: -0.0556
Epoch 8, Loss G: -1.1953,Loss D: -0.0472
Epoch 9, Loss G: -1.1261,Loss D:  0.2106
Epoch 10, Loss G: -1.4807,Loss D: -0.2301
Epoch 11, Loss G: -1.5590,Loss D:  0.6454
Epoch 12, Loss G: -0.9492,Loss D: -0.1328
Epoch 13, Loss G: -0.7021,Loss D: -0.0183
Epoch 14, Loss G: -0.9436,Loss D: -0.1868
Epoch 15, Loss G: -0.8436,Loss D: -0.2371
Epoch 16, Loss G: -1.0039,Loss D:  0.2429
Epoch 17, Loss G: -1.0705,Loss D:  0.5452
Epoch 18, Loss G: -0.7451,Loss D:  0.4191
Epoch 19, Loss G: -0.9706,Loss D: -0.0214
Epoch 20, Loss G: -0.7253,Loss D:  0.0050
Epoch 21, Loss G: -1.3722,Loss D:  0.3686
Epoch 22, Loss G: 

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.976801,0.976801,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.719315,0.719315,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.359388,0.359388,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.7115159034729004 seconds 

Training in Progress - TVAE: Batch size - 100, Epochs - 5000


TVAE trained. 
Training time: 717.6540017127991 seconds 

Synthetic Data Evaluation - TVAE: Batch size - 100, Epochs - 5000



Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.9971,0.9971,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.884914,0.884914,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.687059,0.687059,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 7, Loss G: -1.0489,Loss D: -0.3671
Epoch 8, Loss G: -1.2779,Loss D:  0.3399
Epoch 9, Loss G: -1.4765,Loss D:  0.4509
Epoch 10, Loss G: -1.0443,Loss D:  0.4304
Epoch 11, Loss G: -1.0297,Loss D: -0.0505
Epoch 12, Loss G: -0.5909,Loss D:  0.1179
Epoch 13, Loss G: -0.6668,Loss D:  0.2804
Epoch 14, Loss G: -0.5408,Loss D:  0.0828
Epoch 15, Loss G: -0.5840,Loss D:  0.1087
Epoch 16, Loss G: -1.0806,Loss D:  0.0386
Epoch 17, Loss G: -1.2769,Loss D:  0.1298
Epoch 18, Loss G: -0.9666,Loss D:  0.0034
Epoch 19, Loss G: -0.6830,Loss D:  0.2307
Epoch 20, Loss G: -0.4591,Loss D:  0.1986
Epoch 21, Loss G: -0.7147,Loss D: -0.4337
Epoch 22, Loss G: -0.7562,Loss D: -0.5008
Epoch 23, Loss G: -0.9856,Loss D:  0.0828
Epoch 24, Loss G: -0.8640,Loss D: -0.3335
Epoch 25, Loss G: -1.4014,Loss D: -0.4420
Epoch 26, Loss G: -1.2752,Loss D:  0.1284
Epoch 27, Loss G: -1.4020,Loss D: -0.0038
Epoch 28, Loss G: -1.2325,Loss D:  0.4397
Epoch 29, Loss

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.988399,0.988399,0.0,1.0,MAXIMIZE,
1,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.850728,0.850728,0.0,1.0,MAXIMIZE,
2,ContinuousKLDivergence,Continuous Kullback–Leibler Divergence,0.544663,0.544663,0.0,1.0,MAXIMIZE,
3,DiscreteKLDivergence,Discrete Kullback–Leibler Divergence,,,0.0,1.0,MAXIMIZE,
4,NumericalRadiusNearestNeighbor,Numerical Radius Nearest Neighbor,,,0.0,inf,MAXIMIZE,`key_fields` must be passed either directly or...



Evaluation time: 1.6353440284729004 seconds 



In [None]:
# Pairwise overview of the real data, coloured by diagnosis:
# KDE curves on the diagonal, scatter plots everywhere else.
g = sns.PairGrid(df, hue="diagnosis")
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()
g.fig.suptitle('Real Data Distribution', verticalalignment='baseline', weight="bold")

In [None]:
def generate_corr_heatmap(real, syn, title):
    """Draw the correlation matrices of two tables side by side.

    Parameters
    ----------
    real : pandas.DataFrame
        Reference table; plotted in the left panel titled "Real".
    syn : pandas.DataFrame
        Synthetic table; plotted in the right panel titled "Synthetic".
    title : str
        Figure-level title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding both heatmaps.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 7))

    panels = (("Real", real), ("Synthetic", syn))
    for ax, (panel_title, frame) in zip(axes, panels):
        # Restrict to numeric/bool columns explicitly: calling .corr() on a
        # frame that still contains string columns raises on recent pandas
        # versions instead of silently skipping them.
        corr = frame.select_dtypes(include=["number", "bool"]).corr()
        sns.heatmap(corr, ax=ax, annot=False, fmt='.2f', cmap="BuGn").set(title=panel_title)

    fig.suptitle(title, verticalalignment='baseline', weight="bold")
    return fig

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Generate report: for every run that produced synthetic data, plot its pairwise
# distributions and correlation heatmaps; for every CTGAN run additionally
# compare the target distribution of Real vs. TVAE vs. CTGAN.
for key in key_list:
    syn_data = synthetic_data_mapping[key]
    if syn_data.empty:
        # Still the empty placeholder - this model was never sampled.
        continue

    # Plot univariate and bivariate correlations
    g = sns.PairGrid(syn_data, hue="diagnosis")
    g.map_diag(sns.kdeplot)
    g.map_offdiag(sns.scatterplot)
    g.add_legend()
    g.fig.suptitle('Synthetic Data Distribution: ' + key, verticalalignment='baseline', weight="bold")

    # Plot heatmap
    generate_corr_heatmap(df, syn_data, title=key)

    # Plot target. The matching TVAE run is looked up through its key (same
    # batch size / epochs suffix) rather than through a running counter, so a
    # missing result for one model cannot shift the pairing of later ones.
    if key.startswith(MODEL_CTGAN):
        tvae_key = MODEL_TVAE + key[len(MODEL_CTGAN):]
        tvae_data = synthetic_data_mapping.get(tvae_key)
        if tvae_data is not None and not tvae_data.empty:
            fig, (ax0, ax1, ax2) = plt.subplots(nrows=1, ncols=3, figsize=(18,6))
            sns.histplot(data=df,ax=ax0, x='diagnosis', color="blue", label='Real').set(title='Real')
            sns.histplot(data=tvae_data,ax=ax1, x='diagnosis', color="red", label='TVAE').set(title='TVAE')
            sns.histplot(data=syn_data,ax=ax2, x='diagnosis', color="green", label='CTGAN').set(title='CTGAN')

    # Render this run's figures now instead of accumulating all of them until
    # the end of the loop.
    plt.show()
        