<a href="https://colab.research.google.com/github/ShankarChavan/synthetic-data-generation/blob/main/SDV/CTGAN/CTGAN_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install sdv
!pip install ydata-profiling

# CTGAN (Conditional Tabular Generative Adversarial Network)

CTGAN uses generative adversarial networks (GANs) to create synthetic data with high fidelity.

### Load Data

In [2]:
from sdv.datasets.demo import download_demo

# Load the single-table `fake_hotel_guests` demo; the call yields the data
# and its accompanying metadata, which the cells below reuse.
real_data, metadata = download_demo(modality='single_table', dataset_name='fake_hotel_guests')

### Creating a Synthesizer

In [3]:
from sdv.single_table import CTGANSynthesizer

# Build a CTGAN synthesizer from the dataset metadata, leaving every other
# parameter at its default, then train it on the real data.
ctg_synthesizer = CTGANSynthesizer(metadata=metadata)
ctg_synthesizer.fit(real_data)

### Configurable parameters

In [4]:
# Display the synthesizer's current parameter values as the cell output.
ctg_synthesizer.get_parameters()

{'enforce_min_max_values': True,
 'enforce_rounding': True,
 'locales': None,
 'embedding_dim': 128,
 'generator_dim': (256, 256),
 'discriminator_dim': (256, 256),
 'generator_lr': 0.0002,
 'generator_decay': 1e-06,
 'discriminator_lr': 0.0002,
 'discriminator_decay': 1e-06,
 'batch_size': 500,
 'discriminator_steps': 1,
 'log_frequency': True,
 'verbose': False,
 'epochs': 300,
 'pac': 10,
 'cuda': True}

### Generate Synthetic Data

In [5]:
# Draw 500 synthetic rows from the fitted synthesizer and preview the first few.
ctg_synthetic_data = ctg_synthesizer.sample(num_rows=500)
ctg_synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dsullivan@example.net,True,BASIC,16.43,05 Jan 2020,08 Jan 2021,137.34,"90469 Karla Knolls Apt. 781\nSusanberg, CA 70033",5161033759518983
1,steven59@example.org,False,SUITE,10.57,19 May 2020,23 Dec 2020,87.88,"6108 Carla Ports Apt. 116\nPort Evan, MI 71694",4133047413145475690
2,brandon15@example.net,True,BASIC,35.68,11 May 2020,12 May 2020,191.38,86709 Jeremy Manors Apt. 786\nPort Garychester...,4977328103788
3,humphreyjennifer@example.net,False,BASIC,28.75,27 Dec 2020,15 Nov 2020,201.35,"8906 Bobby Trail\nEast Sandra, NY 43986",3524946844839485
4,joshuabrown@example.net,True,DELUXE,,07 Jan 2021,03 May 2020,396.33,"732 Dennis Lane\nPort Nicholasstad, DE 49786",4446905799576890978


### Evaluating Real vs. Synthetic Data

#### Diagnostics

In [6]:
from sdv.evaluation.single_table import run_diagnostic

# Run the diagnostic report comparing the synthetic sample with the real data;
# the scores are printed in the cell output.
diagnostic = run_diagnostic(
    real_data=real_data, synthetic_data=ctg_synthetic_data, metadata=metadata
)

Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 9/9 [00:00<00:00, 533.45it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 168.90it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%


#### Quality

In [7]:
from sdv.evaluation.single_table import evaluate_quality

# Score how closely the synthetic sample matches the real data. Arguments are
# passed by keyword, matching the other evaluation calls in this notebook.
quality_report = evaluate_quality(
    real_data=real_data, synthetic_data=ctg_synthetic_data, metadata=metadata
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 9/9 [00:00<00:00, 728.23it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 36/36 [00:00<00:00, 131.47it/s]

Overall Score: 77.55%

Properties:
- Column Shapes: 79.98%
- Column Pair Trends: 75.13%


### Visualising Data

In [8]:
from sdv.evaluation.single_table import get_column_plot

# Plot the `room_type` column for the real and the synthetic data.
plot_kwargs = dict(real_data=real_data, synthetic_data=ctg_synthetic_data, metadata=metadata)
fig = get_column_plot(column_name='room_type', **plot_kwargs)
fig.show()

In [9]:
from sdv.evaluation.single_table import get_column_pair_plot

# Plot the `room_rate` / `room_type` column pair for the real and the synthetic data.
fig = get_column_pair_plot(
    real_data=real_data, synthetic_data=ctg_synthetic_data,
    column_names=['room_rate', 'room_type'], metadata=metadata,
)
fig.show()

# Customize CTGAN default parameters

In [10]:
# Second synthesizer on the same metadata, overriding only the number of
# training epochs (5000), then trained on the real data.
custom_ctg_synthesizer = CTGANSynthesizer(metadata=metadata, epochs=5000)
custom_ctg_synthesizer.fit(real_data)

### Evaluate quality report

In [11]:
# Sample 500 rows from the customized synthesizer and score them against the real data.
synthetic_data_customized_ctg = custom_ctg_synthesizer.sample(num_rows=500)

# NOTE: this rebinds `quality_report`, replacing the report computed for the
# default synthesizer earlier in the notebook.
quality_report = evaluate_quality(
    real_data=real_data, synthetic_data=synthetic_data_customized_ctg, metadata=metadata
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 9/9 [00:00<00:00, 869.83it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 36/36 [00:00<00:00, 139.59it/s]

Overall Score: 84.54%

Properties:
- Column Shapes: 85.07%
- Column Pair Trends: 84.0%


### Visualize data generated by the customized CTGAN

In [12]:
# Plot the `room_type` column again, this time for the customized synthesizer's sample.
fig = get_column_plot(
    real_data=real_data, synthetic_data=synthetic_data_customized_ctg,
    column_name='room_type', metadata=metadata,
)
fig.show()

# Generate a full comparison report between real and synthetic data

In [13]:
from ydata_profiling import ProfileReport

# Profile the real data and the customized-CTGAN sample separately.
real_data_report = ProfileReport(real_data, title="original_data")
synthetic_data_report = ProfileReport(synthetic_data_customized_ctg, title="synthetic_data")

# Combine the two profiles into one comparison report and write it to an HTML file.
comparison_report = real_data_report.compare(synthetic_data_report)
comparison_report.to_file("comparison_CTGAN.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


Try running command: 'pip install --upgrade Pillow' to avoid ValueError



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Display the comparison report as this cell's rich output
comparison_report



### Save the model as a .pkl file for future data generation

In [15]:
# Persist the customized synthesizer to disk so it can be reused without retraining.
custom_ctg_synthesizer.save('custom_ctg_synthesizer.pkl')

# Reload it from the same file; this rebinds `custom_ctg_synthesizer` to the loaded object.
# NOTE(review): the .pkl extension suggests a pickle-based format -- only load files from trusted sources.
custom_ctg_synthesizer = CTGANSynthesizer.load('custom_ctg_synthesizer.pkl')