In [28]:
import pandas as pd
import warnings
import json
import os

# Run from the repository root so the relative data and output paths used in
# this notebook resolve. The checkout location can be overridden through the
# UNLOCKING_INFORMATION_DIR environment variable; when it is unset, the
# original hard-coded location is used, so existing behaviour is unchanged.
project_root = os.environ.get(
    "UNLOCKING_INFORMATION_DIR",
    "/home/antonia/code/Unlocking-Information/",
)
os.chdir(project_root)

# Suppresses every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Sub-folder of pages/evaluation/ that receives the saved reports and images.
folder_name = 'metadata'

In [29]:
# Load the RPAD spreadsheet (path is relative to the working directory) and
# print its (rows, columns) shape as a quick sanity check.
rpad_df = pd.read_excel(io='RPAD_data_small.xlsx', engine='openpyxl')
print(rpad_df.shape)

(782, 11)


In [30]:
from sdv.metadata import SingleTableMetadata

# Start from an empty metadata object and let SDV infer an initial column
# description from the loaded frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(rpad_df)

In [31]:
# Column groups by intended type. Only the integer and float groups are used
# below; `str_col` is not referenced in this cell.
str_col = ['Sex', 'Management', 'Severity']
int_col = ['Length_of_Stay', 'Alvarado_Score', 'Paedriatic_Appendicitis_Score']
float_col = ['Age', 'BMI', 'Height', 'Weight']

# Representation to record for each numerical column. Integer entries are
# merged last so they win if a name were ever listed in both groups.
numeric_representation = {
    **dict.fromkeys(float_col, 'Float'),
    **dict.fromkeys(int_col, 'Int64'),
}

# Only the metadata is updated here; the dataframe's own dtypes are not cast.
for col in rpad_df:
    representation = numeric_representation.get(col)
    if representation is None:
        continue
    metadata.update_column(
        column_name=col,
        sdtype='numerical',
        computer_representation=representation,
    )

# Mark ID as an identifier column whose values follow a four-digit pattern.
metadata.update_column(
    column_name='ID',
    sdtype='id',
    regex_format='[0-9]{4}',
)

In [7]:
# Write the metadata, converted to a dictionary, to a JSON file in the working directory.
with open("metadata_rpad_data.json", 'w') as metadata_file:
    json.dump(metadata.to_dict(), metadata_file)

# To restore the metadata from that file instead of re-detecting it:
# with open("metadata_rpad_data.json") as f:
#     metadata_dict = json.load(f)
#
# metadata = SingleTableMetadata.load_from_dict(metadata_dict)

In [32]:
from sdv.single_table import CTGANSynthesizer

# Build a CTGAN synthesizer from the table metadata and fit it on the real data.
model = CTGANSynthesizer(metadata=metadata)
model.fit(data=rpad_df)

In [34]:
# Draw as many synthetic rows as the real table has, then print the resulting shape.
n_real_rows = len(rpad_df)
synth_data = model.sample(num_rows=n_real_rows)
print(synth_data.shape)
# Optional export of the sample:
# synth_data.to_csv('synthetic_rpad.csv', index=False)

(782, 11)


In [33]:
# Rebind `model` to the synthesizer stored on disk.
# NOTE(review): in file order this cell sits after the sampling cell, although its
# execution count (33) precedes the sampling cell's (34). On a top-to-bottom run
# `synth_data` is drawn before this load happens -- confirm the intended order.
# NOTE(review): the file is a .pkl, presumably pickle-based -- only load trusted files.
model_path = f"pages/evaluation/{folder_name}/results/rpad_data_model.pkl"
# To persist the current model instead: model.save(model_path)
model = CTGANSynthesizer.load(model_path)

In [11]:
# Diagnostic Report:
from sdmetrics.reports.single_table import DiagnosticReport

dg_report = DiagnosticReport()
dg_report.generate(rpad_df, synth_data, metadata, verbose=True)

dg_report.save(filepath=f"pages/evaluation/{folder_name}/results/diagnostic_report.pkl")

Creating report: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [12]:
# Quality Report
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    rpad_df,
    synth_data,
    metadata
)

quality_report.save(filepath=f"pages/evaluation/{folder_name}/results/quality_report.pkl")

Creating report: 100%|██████████| 4/4 [00:00<00:00, 25.48it/s]



Overall Quality Score: 78.24%

Properties:
Column Shapes: 81.46%
Column Pair Trends: 75.03%


In [13]:
# Display the report's 'Column Shapes' visualization and save it as a PNG.
fig = quality_report.get_visualization('Column Shapes')
fig.show()
col_shapes_path = f"pages/evaluation/{folder_name}/images/col_shapes.png"
fig.write_image(file=col_shapes_path)

In [36]:
from sdv.evaluation.single_table import get_column_plot

# Plot the 'Sex' column for real vs. synthetic data, display it, and save it as a PNG.
fig = get_column_plot(real_data=rpad_df, synthetic_data=synth_data, column_name='Sex', metadata=metadata)

fig.show()
sex_plot_path = f"pages/evaluation/{folder_name}/images/col_plot_sex.png"
fig.write_image(file=sex_plot_path)


In [38]:
# plot two variables:
from sdv.evaluation.single_table import get_column_pair_plot

fig_categorical = get_column_pair_plot(
    real_data=rpad_df,
    synthetic_data=synth_data,
    column_names=['Severity', 'Management'],
    metadata=metadata)
    
fig_categorical.show()
fig_categorical.write_image(file = f"pages/evaluation/{folder_name}/images/Severity_Management_pair_plot.png")

fig_numerical = get_column_pair_plot(
    real_data=rpad_df,
    synthetic_data=synth_data,
    column_names=['Weight', 'Height'],
    metadata=metadata)
    
fig_numerical.show()
fig_numerical.write_image(file = f"pages/evaluation/{folder_name}/images/Weight_Height_pair_plot.png")