In [14]:
import pandas as pd
import warnings
import os
import json

os.chdir("/path/to/your/project/") # insert your your path to the project here
warnings.filterwarnings("ignore")

folder_name = 'default'

In [2]:
rpad_df = pd.read_excel(r'data/RPAD_data_small.xlsx', engine='openpyxl')
print(rpad_df.shape)
# print(rpad_df.head(5))

(782, 11)


In [3]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=rpad_df)

In [None]:
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import evaluate_quality
from ray import tune


# 1. Define an objective function.
def objective(config):
    model = CTGANSynthesizer(config["metadata"], epochs=config["epochs"], 
                             discriminator_steps = config["discriminator_steps"], 
                             discriminator_lr=config["lr_d"])
    model.fit(config["data"])
    synthetic_data = model.sample(config["data"].shape[0])
    score = evaluate_quality(config["data"], synthetic_data, config["metadata"]).get_score()
    return {"score": score}


# 2. Define a search space.
search_space = {
    "epochs": tune.grid_search([300, 700, 1500]),
    "lr_d": tune.choice([0.0002, 0.001, 0.01]),
    "discriminator_steps": tune.grid_search([1, 2, 5]),
    "batch_size": tune.grid_search([100, 200, 500]),
    "metadata": metadata,
    "data": rpad_df
}

# 3. Start a Tune run and print the best result.
tuner = tune.Tuner(
    tune.with_resources(
        objective, resources={"cpu": 10, "gpu": 0.5}
    ), 
    tune_config=tune.TuneConfig(num_samples=10),
    param_space=search_space
)
results = tuner.fit()
#about 2h


In [21]:
results_dict = results.get_best_result(metric="score", mode="max").config
with open("best_hyper_params.json", 'w') as f:
   json.dump({'epochs': 1500, 'lr_d': 0.001, 'discriminator_steps': 5, 'batch_size': 100}, f)
   
print(results.get_best_result(metric="score", mode="max").config)

1500
0.001
5
100
{'epochs': 1500, 'lr_d': 0.001, 'discriminator_steps': 5, 'batch_size': 100, 'metadata': {
    "columns": {
        "ID": {
            "sdtype": "numerical"
        },
        "Age": {
            "sdtype": "numerical"
        },
        "BMI": {
            "sdtype": "numerical"
        },
        "Sex": {
            "sdtype": "categorical"
        },
        "Height": {
            "sdtype": "numerical"
        },
        "Weight": {
            "sdtype": "numerical"
        },
        "Length_of_Stay": {
            "sdtype": "numerical"
        },
        "Management": {
            "sdtype": "categorical"
        },
        "Severity": {
            "sdtype": "categorical"
        },
        "Alvarado_Score": {
            "sdtype": "numerical"
        },
        "Paedriatic_Appendicitis_Score": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}, 'data':        ID        Age        BMI     Sex  Height  Weight  L

In [5]:
synth_data = model.sample(num_rows=rpad_df.shape[0])
print(synth_data.shape)
#print(synth_data.head(10))
#synth_data.to_csv('synthetic_rpad.csv', index=False)

(782, 11)


In [6]:
model.save(f"pages/evaluation/{folder_name}/results/rpad_data_model.pkl")
#model = CTGANSynthesizer.load(f"pages/evaluation/{folder_name}/results/rpad_data_model.pkl")

In [7]:
# Diagnostic Report:
from sdmetrics.reports.single_table import DiagnosticReport

dg_report = DiagnosticReport()
dg_report.generate(rpad_df, synth_data, metadata, verbose=True)

dg_report.save(filepath=f"pages/evaluation/{folder_name}/results/diagnostic_report.pkl")

Creating report: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [8]:
# Quality Report
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    rpad_df,
    synth_data,
    metadata
)get_score

quality_report.save(filepath=f"pages/evaluation/{folder_name}/results/quality_report.pkl")

Creating report: 100%|██████████| 4/4 [00:00<00:00, 22.65it/s]



Overall Quality Score: 81.22%

Properties:
Column Shapes: 83.32%
Column Pair Trends: 79.12%


In [9]:
fig = quality_report.get_visualization('Column Shapes')
fig.show()
fig.write_image(file = f"pages/evaluation/{folder_name}/images/col_shapes.png")

In [10]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=rpad_df,
    synthetic_data=synth_data,
    column_name='Length_of_Stay',
    metadata=metadata
)
    
fig.show()
fig.write_image(file = f"pages/evaluation/{folder_name}/images/col_plot_len_stay.png")


In [11]:
# plot two variables:
from sdv.evaluation.single_table import get_column_pair_plot

fig_categorical = get_column_pair_plot(
    real_data=rpad_df,
    synthetic_data=synth_data,
    column_names=['Severity', 'Management'],
    metadata=metadata)
    
fig_categorical.show()
fig_categorical.write_image(file = f"pages/evaluation/{folder_name}/images/Severity_Management_pair_plot.png")

fig_numerical = get_column_pair_plot(
    real_data=rpad_df,
    synthetic_data=synth_data,
    column_names=['Age', 'Height'],
    metadata=metadata)
    
fig_numerical.show()
fig_categorical.write_image(file = f"pages/evaluation/{folder_name}/images/Age_Height_pair_plot.png")