In [8]:
import pandas as pd
import warnings
import json
import os

# All relative paths used in this notebook are resolved against the project root.
project_root = "/path/to/your/project/"  # insert the path to your project here
os.chdir(project_root)

# Suppress every warning emitted while the notebook runs.
warnings.filterwarnings("ignore")

# Sub-folder of pages/evaluation/ used for the model, the reports and the images.
folder_name = 'best_setting'

In [9]:
# Read the RPAD spreadsheet and print its (rows, columns) shape.
rpad_path = r'data/RPAD_data_small.xlsx'
rpad_df = pd.read_excel(rpad_path, engine='openpyxl')
print(rpad_df.shape)

(782, 11)


In [10]:
from sdv.metadata import SingleTableMetadata

# Let SDV detect a table schema directly from the loaded dataframe.
# NOTE(review): the next cell rebinds `metadata` to the schema loaded from JSON,
# so when the cells run in order this detected schema is not the one used later.
detected_metadata = SingleTableMetadata()
detected_metadata.detect_from_dataframe(data=rpad_df)
metadata = detected_metadata

In [11]:
# Read the stored table schema from JSON and rebuild the SDV metadata object from it.
metadata_path = "ressources/metadata_rpad_data.json"
with open(metadata_path) as metadata_file:
    metadata_dict = json.load(metadata_file)

metadata = SingleTableMetadata.load_from_dict(metadata_dict)

In [12]:
from sdv.single_table import CTGANSynthesizer

# CTGAN synthesizer for the RPAD table; the hyper-parameters are listed one per line.
model = CTGANSynthesizer(
    metadata,
    epochs=1500,
    discriminator_steps=5,
    discriminator_lr=0.001,
    batch_size=100,
)

In [13]:
def calculate_bmi(weight, height):
    """Return weight divided by the square of (height / 100).

    Works element-wise on anything that supports arithmetic, e.g. plain
    numbers or pandas Series.

    Args:
        weight: Weight value(s).
        height: Height value(s); divided by 100 before squaring.

    Returns:
        The BMI value(s), in the same container type as the inputs.
    """
    scaled_height = height / 100
    return weight / scaled_height ** 2

# Drop (in place) every row whose recorded BMI, rounded to a whole number, differs
# from the BMI recomputed from Weight and Height. Rows with a missing value (NaN) on
# either side also compare as unequal and are therefore dropped as well.
recorded_bmi = round(rpad_df['BMI'], 0)
recomputed_bmi = round(calculate_bmi(rpad_df['Weight'], rpad_df['Height']), 0)
bmi_mismatch = recorded_bmi != recomputed_bmi
rpad_df.drop(rpad_df[bmi_mismatch].index, inplace=True)
print(rpad_df.shape)

(729, 11)


In [14]:
# Load the custom constraint class from the project's utils folder. The path is
# relative to the project root selected with os.chdir(...) at the top of the notebook,
# like every other path used here, so it does not depend on one user's home directory.
model.load_custom_constraint_classes(
    filepath='utils/example_custom_constraint.py',
    class_names=['BMI_Formulae']
)

# Spec for the custom constraint, applied to the BMI, Weight and Height columns.
constraint = {
    'constraint_class': 'BMI_Formulae',
    'constraint_parameters': {
        'column_names': ['BMI', 'Weight', 'Height']
    }
}

In [15]:
def get_positive_constraint(column_name, strict):
    """Build the spec dictionary for an SDV 'Positive' constraint on one column.

    Args:
        column_name: Name of the column the constraint applies to.
        strict: Forwarded as ``strict_boundaries``; in SDV, True requires values
            greater than zero while False also allows zero.

    Returns:
        A new dict with the keys ``constraint_class`` and ``constraint_parameters``.
    """
    parameters = {
        'column_name': column_name,
        'strict_boundaries': strict,
    }
    return {'constraint_class': 'Positive', 'constraint_parameters': parameters}
# Body measurements and length of stay use strict boundaries.
positive_bmi, positive_weight, positive_height, positive_los = (
    get_positive_constraint(column, True)
    for column in ('BMI', 'Weight', 'Height', 'Length_of_Stay')
)
# Age and the two clinical scores use non-strict boundaries.
positive_age, positive_a_score, positive_pa_score = (
    get_positive_constraint(column, False)
    for column in ('Age', 'Alvarado_Score', 'Paedriatic_Appendicitis_Score')
)

In [16]:
# Register the custom BMI constraint together with all positivity constraints.
all_constraints = [
    constraint,
    positive_bmi,
    positive_weight,
    positive_height,
    positive_los,
    positive_age,
    positive_a_score,
    positive_pa_score,
]
model.add_constraints(all_constraints)

In [17]:

# Reuse the trained synthesizer when it has already been saved; otherwise train the
# model configured in the earlier cells on the filtered data and save it, so that
# switching between training and loading no longer requires commenting code in and out.
model_path = f"pages/evaluation/{folder_name}/results/rpad_data_model.pkl"
if os.path.exists(model_path):
    # NOTE: loading unpickles the file, so only load model files you trust.
    # The loaded synthesizer replaces the `model` object built in the earlier cells.
    model = CTGANSynthesizer.load(model_path)
else:
    model.fit(rpad_df)
    # Make sure the results folder exists before writing the pickle.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)

In [18]:
# Draw as many synthetic rows as the (filtered) real table contains.
n_real_rows = rpad_df.shape[0]
synth_data = model.sample(num_rows=n_real_rows)
print(synth_data.shape)
# Optional: inspect with synth_data.head(10) or export with
# synth_data.to_csv('synthetic_rpad.csv', index=False)

Sampling rows: 100%|██████████| 729/729 [00:02<00:00, 290.74it/s]

(729, 11)





In [19]:
# Diagnostic report: basic validity checks of the synthetic table against the real one.
from sdmetrics.reports.single_table import DiagnosticReport

dg_report = DiagnosticReport()
# SDMetrics documents `metadata` as a plain dictionary, so the SDV metadata object
# is converted explicitly instead of being passed as-is.
dg_report.generate(rpad_df, synth_data, metadata.to_dict(), verbose=True)

# Store the report alongside the other results of this setting.
dg_report.save(filepath=f"pages/evaluation/{folder_name}/results/diagnostic_report.pkl")

Creating report: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [20]:
# Quality report: compares the real and the synthetic table and prints the scores.
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(real_data=rpad_df, synthetic_data=synth_data, metadata=metadata)

# Store the report alongside the other results of this setting.
quality_report_path = f"pages/evaluation/{folder_name}/results/quality_report.pkl"
quality_report.save(filepath=quality_report_path)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 26.07it/s]



Overall Quality Score: 89.82%

Properties:
Column Shapes: 90.79%
Column Pair Trends: 88.84%


In [21]:
# Display the 'Column Shapes' chart of the quality report and save it as an image.
col_shapes_path = f"pages/evaluation/{folder_name}/images/col_shapes.png"
fig = quality_report.get_visualization('Column Shapes')
fig.show()
fig.write_image(file=col_shapes_path)

In [22]:
from sdv.evaluation.single_table import get_column_plot

# Real vs. synthetic values of the 'Sex' column: display the plot and save it as an image.
sex_plot_path = f"pages/evaluation/{folder_name}/images/col_plot_sex.png"
fig = get_column_plot(real_data=rpad_df, synthetic_data=synth_data, column_name='Sex', metadata=metadata)
fig.show()
fig.write_image(file=sex_plot_path)


In [23]:
# Plot pairs of columns (real vs. synthetic), display each figure and save it as an image.
from sdv.evaluation.single_table import get_column_pair_plot


def _plot_column_pair(column_names, image_path):
    """Create the pair plot for two columns, show it, write it to image_path and return it."""
    pair_fig = get_column_pair_plot(
        real_data=rpad_df,
        synthetic_data=synth_data,
        column_names=column_names,
        metadata=metadata,
    )
    pair_fig.show()
    pair_fig.write_image(file=image_path)
    return pair_fig


fig_categorical = _plot_column_pair(
    ['Severity', 'Management'],
    f"pages/evaluation/{folder_name}/images/Severity_Management_pair_plot.png",
)

fig_numerical = _plot_column_pair(
    ['Weight', 'Height'],
    f"pages/evaluation/{folder_name}/images/Weight_Height_pair_plot.png",
)