In [67]:
import pandas as pd
import warnings
import os

# All relative paths used in this notebook are resolved against the working
# directory, so point it at the project root (replace the placeholder below).
os.chdir("/path/to/your/project/")

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Sub-folder of pages/evaluation/ that receives this run's results and images.
folder_name = "tuning"

In [68]:
# Read the real RPAD dataset from the Excel workbook (path is relative to the
# working directory).
rpad_df = pd.read_excel(
    "data/RPAD_data_small.xlsx",
    engine="openpyxl",
)

# Quick sanity check on the load: prints (rows, columns).
print(rpad_df.shape)
# Optional preview of the first rows:
# print(rpad_df.head(5))

(782, 11)


In [69]:
from sdv.metadata import SingleTableMetadata

# Create an empty single-table metadata object, then populate it by running
# SDV's detection on the real dataframe.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(rpad_df)

In [70]:
from sdv.single_table import CTGANSynthesizer

# Build the CTGAN synthesizer with the hyperparameters chosen for this run.
model = CTGANSynthesizer(
    metadata,
    epochs=1500,
    batch_size=100,
    discriminator_steps=5,
    discriminator_lr=0.001,
)

# Train on the real data. Per the original author's note this took ~18 minutes.
model.fit(rpad_df)

In [72]:
# Draw exactly as many synthetic rows as the real dataset contains.
synth_data = model.sample(num_rows=len(rpad_df))

# Prints (rows, columns) of the synthetic sample.
print(synth_data.shape)
# Optional: preview or export the synthetic sample.
# print(synth_data.head(10))
# synth_data.to_csv('synthetic_rpad.csv', index=False)

(782, 11)


In [71]:
# To persist a freshly fitted synthesizer, run the following line instead:
# model.save(f"pages/evaluation/{folder_name}/results/rpad_data_model.pkl")

# Replace the in-memory `model` with the synthesizer stored on disk.
# NOTE(review): the execution counts suggest this cell was run before the
# sampling cell above it; on a top-to-bottom run the loaded model is not what
# produced `synth_data` — confirm the intended cell order.
# NOTE(review): the file is presumably a pickle (.pkl); only load files you trust.
model = CTGANSynthesizer.load(
    f"pages/evaluation/{folder_name}/results/rpad_data_model.pkl"
)

In [51]:
# Diagnostic Report:
from sdmetrics.reports.single_table import DiagnosticReport

# Build the SDMetrics diagnostic report comparing the synthetic sample with
# the real data.
dg_report = DiagnosticReport()

# SDMetrics reports are documented to take the metadata as a plain dictionary,
# whereas `metadata` here is an SDV SingleTableMetadata object. Convert it
# explicitly so the call does not depend on implicit coercion inside the
# installed SDMetrics version.
dg_report.generate(rpad_df, synth_data, metadata.to_dict(), verbose=True)

# Persist the generated report next to the other results of this run.
dg_report.save(filepath=f"pages/evaluation/{folder_name}/results/diagnostic_report.pkl")

Creating report: 100%|██████████| 4/4 [00:04<00:00,  1.25s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [52]:
# Quality Report
from sdv.evaluation.single_table import evaluate_quality

# Score how closely the synthetic sample matches the real data; the helper
# prints the overall score and per-property breakdown as it runs.
quality_report = evaluate_quality(
    real_data=rpad_df,
    synthetic_data=synth_data,
    metadata=metadata,
)

# Persist the report next to the other results of this run.
quality_report.save(
    filepath=f"pages/evaluation/{folder_name}/results/quality_report.pkl"
)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 22.21it/s]



Overall Quality Score: 87.73%

Properties:
Column Shapes: 89.38%
Column Pair Trends: 86.09%


In [53]:
# Visualise the per-column 'Column Shapes' scores from the quality report.
fig = quality_report.get_visualization('Column Shapes')

# Render inline, then export the same figure as a PNG for the evaluation pages.
fig.show()
fig.write_image(
    file=f"pages/evaluation/{folder_name}/images/col_shapes.png"
)

In [76]:
from sdv.evaluation.single_table import get_column_plot

# Compare the real and synthetic data for the single column 'Height'.
fig = get_column_plot(
    column_name='Height',
    real_data=rpad_df,
    synthetic_data=synth_data,
    metadata=metadata,
)

# Render inline, then export the same figure as a PNG for the evaluation pages.
fig.show()
fig.write_image(
    file=f"pages/evaluation/{folder_name}/images/col_plot_height.png"
)


In [77]:
# plot two variables:
from sdv.evaluation.single_table import get_column_pair_plot

# Pair plot 1: the 'Severity' / 'Management' column pair, real vs. synthetic.
fig_categorical = get_column_pair_plot(
    column_names=['Severity', 'Management'],
    real_data=rpad_df,
    synthetic_data=synth_data,
    metadata=metadata,
)
fig_categorical.show()
fig_categorical.write_image(
    file=f"pages/evaluation/{folder_name}/images/Severity_Management_pair_plot.png"
)

# Pair plot 2: the 'Weight' / 'Height' column pair, real vs. synthetic.
fig_numerical = get_column_pair_plot(
    column_names=['Weight', 'Height'],
    real_data=rpad_df,
    synthetic_data=synth_data,
    metadata=metadata,
)
fig_numerical.show()
fig_numerical.write_image(
    file=f"pages/evaluation/{folder_name}/images/Weight_Height_pair_plot.png"
)