## Data Synthesis Pipeline

### Install Kubeflow Pipelines module

In [None]:
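# Colab: uncomment the next line to install the pinned KFP v1 SDK.
#!pip install kfp==1.8.22
# VS Code / local Jupyter: uncomment the next line instead (%pip installs into the kernel's environment).
#%pip install kfp==1.8.22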

In [12]:
from kfp import dsl
from kfp.compiler import Compiler
from kfp import components as comp
from typing import NamedTuple

In [13]:
# Container image used as base_image for every component created in this notebook.
# NOTE(review): ":latest" is a floating tag - consider pinning a version for reproducible runs.
BASE_IMAGE = "kristiantamm/synthguard_public:latest"

## InputHandler Class

### Input data from URL

In [None]:
# Local exploration: load the source dataset into the notebook kernel.
from synthguard.input_handler import InputHandler
inputHandler = InputHandler()
# The example.com URL looks like a placeholder - point it at the real CSV before running.
# NOTE(review): input_component below loads its data with load_data_csv(...) and then reads
# inputHandler.data; confirm that load_data_from_url returns the loaded frame.
input_data = inputHandler.load_data_from_url("https://example.com/gun_ownership.csv")


Kubeflow Pipelines Component - Adjust function parameters accordingly

In [22]:

def input_component(output_csv: comp.OutputPath('csv'), mlpipeline_ui_metadata_path: comp.OutputPath()):
    """Load the source dataset, replace missing markers and save it as a CSV artifact.

    Args:
        output_csv: Path the cleaned dataset is saved to.
        mlpipeline_ui_metadata_path: Path the UI metadata JSON (an inline table
            of the first 10 rows) is written to.
    """
    # All imports stay inside the function so the component is self-contained.
    import json
    import pandas as pd
    import synthguard.helper_functions as sd
    from synthguard.input_handler import InputHandler

    dataset_url = "https://raw.githubusercontent.com/SynthGuard/synthguard-framework/main/pipelines/LAGO/synthetic_datasets/expanded_gun_ownership_dataset.csv"
    handler = InputHandler()
    handler.load_data_csv(dataset_url)

    # pd.NA, None and '-' are all stored as the literal string 'None'.
    missing_markers = [pd.NA, None, '-']
    handler.data = handler.data.replace(missing_markers, 'None')
    sd.save_to_csv(handler.data, output_csv)

    # Inline preview table of the first 10 rows for the pipeline UI.
    preview = handler.data.head(10)
    preview_table = {
        'type': 'table',
        'storage': 'inline',
        'format': 'csv',
        'header': list(preview.columns),
        'source': preview.to_csv(index=False),
    }
    ui_metadata = {'outputs': [preview_table]}

    with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:
        metadata_file.write(json.dumps(ui_metadata))


# Rebind the name to the KFP component built from the function above.
input_component = comp.create_component_from_func(func=input_component, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## DataPreprocessor Class

In [None]:
# Local exploration: extract metadata from the frame loaded by the InputHandler cell.
# The former `input_data = sd.load_data_csv(input_csv)` line was component-only code:
# neither `sd` nor `input_csv` exists at notebook scope, so it raised NameError on a
# fresh kernel. `input_data` from the InputHandler cell is used directly instead.
from synthguard.data_preprocessor import DataPreprocessor
dataPreprocessor = DataPreprocessor(data=input_data)
metadata = dataPreprocessor.extract_metadata()
# Keep the preprocessor's frame: the generator and report cells consume it as processed_data.
processed_data = dataPreprocessor.data


Kubeflow Pipelines Component - Adjust function parameters accordingly

In [15]:

def preprocess_component(input_csv: comp.InputPath('csv'), output_json: comp.OutputPath('json')):
    """Extract metadata from a CSV dataset and save it for downstream components.

    Args:
        input_csv: Path of the CSV dataset to analyse.
        output_json: Path the extracted metadata is saved to.
    """
    # All imports stay inside the function so the component is self-contained.
    from synthguard.data_preprocessor import DataPreprocessor
    import synthguard.helper_functions as sd

    source_frame = sd.load_data_csv(input_csv)
    extracted_metadata = DataPreprocessor(data=source_frame).extract_metadata()
    sd.save_metadata(extracted_metadata, output_json)


# Rebind the name to the KFP component built from the function above.
preprocess_component = comp.create_component_from_func(func=preprocess_component, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## SyntheticDataGenerator Class

In [None]:
# Local exploration: generate synthetic rows from the `metadata` and `processed_data`
# produced by the DataPreprocessor cell.
from synthguard.synthetic_data_generator import SyntheticDataGenerator
# NOTE(review): output_csv=None presumably means "do not write a file" - confirm against
# SyntheticDataGenerator. n_rows=1000 is an example value; the component takes n_rows as input.
syntheticDataGenerator = SyntheticDataGenerator(output_csv=None, n_rows=1000, method="realistic")
generated_data = syntheticDataGenerator.generate_synthetic_data(metadata=metadata, processed_data=processed_data)


Kubeflow Pipelines Component - Adjust function parameters accordingly

In [16]:

def generation_component(n_rows:int, input_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_csv: comp.OutputPath('csv'), mlpipeline_ui_metadata_path: comp.OutputPath()):
    """Generate synthetic data, fill in personal fields and save the result as CSV.

    Args:
        n_rows: Number of rows passed to SyntheticDataGenerator.
        input_csv: Path of the source dataset CSV.
        input_json: Path of the dataset metadata.
        output_csv: Path the generated dataset is saved to.
        mlpipeline_ui_metadata_path: Path the UI metadata JSON (an inline table
            of the first 10 generated rows) is written to.
    """
    # All imports stay inside the function so the component is self-contained.
    import synthguard.helper_functions as sd
    from synthguard.synthetic_data_generator import SyntheticDataGenerator
    from synthguard.generate_personal_data import PersonalFaker
    import json

    processed_data = sd.load_data_csv(input_csv)
    metadata = sd.load_metadata(input_json)
    syntheticDataGenerator = SyntheticDataGenerator(output_csv=None, n_rows=n_rows, method="realistic")
    generated_data = syntheticDataGenerator.generate_synthetic_data(metadata=metadata, processed_data=processed_data)

    # Replace 'Full Name' with names from PersonalFaker's *_est generators, chosen by 'Gender'.
    # Column presence is identical for every row, so it is checked once instead of per row,
    # and the column is assigned in one step rather than written cell by cell while iterating.
    personal_faker = PersonalFaker()
    if 'Gender' in generated_data.columns and 'Full Name' in generated_data.columns:
        full_names = []
        for gender in generated_data['Gender']:
            # First name is drawn before last name, matching the original call order.
            if gender == 'Male':
                first_name = personal_faker.first_name_male_est()
            elif gender == 'Female':
                first_name = personal_faker.first_name_female_est()
            else:
                first_name = personal_faker.first_name_est()
            last_name = personal_faker.last_name_est()
            full_names.append(f"{first_name} {last_name}")
        generated_data['Full Name'] = full_names

    # Add residential addresses; each generated entry is unpacked as a (street, city) pair.
    addresses = personal_faker.generate_local_addresses(len(generated_data))
    generated_data['Residential Address'] = [f"{street}, {city}" for street, city in addresses]

    sd.save_to_csv(generated_data, output_csv)

    # Inline preview table of the first 10 generated rows for the pipeline UI.
    first_10 = generated_data.head(10)
    table_metadata = {
        'outputs': [{
            'type': 'table',
            'storage': 'inline',
            'format': 'csv',
            'header': list(first_10.columns),
            'source': first_10.to_csv(index=False)
        }]
    }
    with open(mlpipeline_ui_metadata_path, 'w') as f:
        json.dump(table_metadata, f)


# Rebind the name to the KFP component built from the function above.
generation_component = comp.create_component_from_func(func=generation_component, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## Privacy Report Class

In [None]:
# Local exploration: evaluate privacy on the frames built by the earlier notebook cells.
# The former sd.load_data_csv(...) / sd.load_metadata(...) lines were component-only code:
# `sd`, `input_synth_csv`, `input_real_csv` and `input_json` do not exist at notebook scope.
# The same notebook variables as the Diagnostic Report cell are used instead.
from synthguard.privacy_report_generator import PrivacyRiskEvaluator
synthetic_data = generated_data
real_data = processed_data
privacyRiskEvaluator = PrivacyRiskEvaluator(real_data = real_data, synthetic_data = synthetic_data, metadata = metadata, method="realistic")
# NOTE(review): privacy_report_component calls run_privacy_realistic() and
# plot_privacy_metrics_realistic() instead - confirm the two method names below exist.
privacyRiskEvaluator.evaluate_privacy()
privacyRiskEvaluator.visualize_privacy_report()


Kubeflow Pipelines Visual Component - Adjust function parameters accordingly

In [17]:

def privacy_report_component(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Run the privacy evaluation and return its HTML report as UI metadata.

    Args:
        input_synth_csv: Path of the synthetic dataset CSV.
        input_real_csv: Path of the real dataset CSV.
        input_json: Path of the dataset metadata.
        output_html: Path the HTML report is written to.

    Returns:
        A VisualizationOutput named tuple whose only field, mlpipeline_ui_metadata,
        is a JSON string describing one inline 'web-app' output with the report HTML.
    """
    # All imports stay inside the function so the component is self-contained.
    from collections import namedtuple
    import json
    import synthguard.helper_functions as sd

    # Create the report file with the opening tags before the evaluator writes to it.
    with open(output_html, "w") as html_out:
        html_out.write("<html><body>")

    from synthguard.privacy_report_generator import PrivacyRiskEvaluator
    # NOTE(review): DataPreprocessor is imported here but not used in this function.
    from synthguard.data_preprocessor import DataPreprocessor

    # Keyword order keeps the original load order: synthetic, real, metadata.
    evaluator = PrivacyRiskEvaluator(
        synthetic_data=sd.load_data_csv(input_synth_csv),
        real_data=sd.load_data_csv(input_real_csv),
        metadata=sd.load_metadata(input_json),
        method="realistic",
    )
    evaluator.run_privacy_realistic()
    evaluator.plot_privacy_metrics_realistic()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded inline in the UI metadata.
    with open(output_html, 'r') as report_file:
        report_html = report_file.read()
    ui_metadata = {
        'outputs': [{'type': 'web-app', 'storage': 'inline', 'source': report_html}]
    }

    VisualizationOutput = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return VisualizationOutput(json.dumps(ui_metadata))


# Rebind the name to the KFP component built from the function above.
privacy_report_component = comp.create_component_from_func(func=privacy_report_component, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## Utility Report Class

In [None]:
# Local exploration: evaluate utility on the frames built by the earlier notebook cells.
# Two fixes: (1) the evaluator was constructed through the not-yet-defined lowercase name
# `utilityEvaluator(...)` instead of the imported class `UtilityEvaluator`, which raised
# NameError; (2) the sd.load_* lines relied on `sd` / `input_*` names that do not exist at
# notebook scope, so the same notebook variables as the Diagnostic Report cell are used.
from synthguard.utility_report_generator import UtilityEvaluator
synthetic_data = generated_data
real_data = processed_data
utilityEvaluator = UtilityEvaluator(real_data = real_data, synthetic_data = synthetic_data, metadata = metadata, method="realistic")
# NOTE(review): utility_report_component uses DataQualityEvaluator from
# synthguard.quality_report_generator instead - confirm which evaluator is intended.
utilityEvaluator.evaluate_utility()
utilityEvaluator.visualize_utility_report()


Kubeflow Pipelines Visual Component - Adjust function parameters accordingly

In [18]:

def utility_report_component(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Run the data-quality evaluation and return its HTML report as UI metadata.

    Args:
        input_synth_csv: Path of the synthetic dataset CSV.
        input_real_csv: Path of the real dataset CSV.
        input_json: Path of the dataset metadata.
        output_html: Path the HTML report is written to.

    Returns:
        A VisualizationOutput named tuple whose only field, mlpipeline_ui_metadata,
        is a JSON string describing one inline 'web-app' output with the report HTML.
    """
    # All imports stay inside the function so the component is self-contained.
    from collections import namedtuple
    import json
    import synthguard.helper_functions as sd

    # Create the report file with the opening tags before the evaluator writes to it.
    with open(output_html, "w") as html_out:
        html_out.write("<html><body>")

    # NOTE(review): the component is named "utility" but evaluates with DataQualityEvaluator
    # from quality_report_generator - confirm this is the intended evaluator.
    from synthguard.quality_report_generator import DataQualityEvaluator

    # Keyword order keeps the original load order: synthetic, real, metadata.
    evaluator = DataQualityEvaluator(
        synthetic_data=sd.load_data_csv(input_synth_csv),
        real_data=sd.load_data_csv(input_real_csv),
        metadata=sd.load_metadata(input_json),
        method="realistic",
    )
    evaluator.evaluate_quality()
    evaluator.plot_quality_report_realistic()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded inline in the UI metadata.
    with open(output_html, 'r') as report_file:
        report_html = report_file.read()
    ui_metadata = {
        'outputs': [{'type': 'web-app', 'storage': 'inline', 'source': report_html}]
    }

    VisualizationOutput = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return VisualizationOutput(json.dumps(ui_metadata))


# Rebind the name to the KFP component built from the function above.
utility_report_component = comp.create_component_from_func(func=utility_report_component, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## Diagnostic Report Class

In [None]:
# Local exploration: run the diagnostic evaluation on the notebook's `processed_data`
# (real), `generated_data` (synthetic) and `metadata` from the earlier cells.
from synthguard.diagnostic_report_generator import DiagnosticEvaluator
diagnosticEvaluator = DiagnosticEvaluator(real_data=processed_data, synthetic_data=generated_data, metadata=metadata)
# NOTE(review): diagnostic_report_component calls run_diagnostic_realistic() and
# plot_diagnostic_report_realistic() instead - confirm the two method names below exist.
diagnosticEvaluator.evaluate_diagnostic()
diagnosticEvaluator.visualize_diagnostic_report()


Kubeflow Pipelines Visual Component - Adjust function parameters accordingly

In [19]:

def diagnostic_report_component(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Run the diagnostic evaluation and return its HTML report as UI metadata.

    Args:
        input_synth_csv: Path of the synthetic dataset CSV.
        input_real_csv: Path of the real dataset CSV.
        input_json: Path of the dataset metadata.
        output_html: Path the HTML report is written to.

    Returns:
        A VisualizationOutput named tuple whose only field, mlpipeline_ui_metadata,
        is a JSON string describing one inline 'web-app' output with the report HTML.
    """
    # All imports stay inside the function so the component is self-contained.
    from collections import namedtuple
    import json
    import synthguard.helper_functions as sd

    # Create the report file with the opening tags before the evaluator writes to it.
    with open(output_html, "w") as html_out:
        html_out.write("<html><body>")

    from synthguard.diagnostic_report_generator import DiagnosticEvaluator

    # Keyword order keeps the original load order: real, synthetic, metadata.
    evaluator = DiagnosticEvaluator(
        real_data=sd.load_data_csv(input_real_csv),
        synthetic_data=sd.load_data_csv(input_synth_csv),
        metadata=sd.load_metadata(input_json),
    )
    evaluator.run_diagnostic_realistic()
    evaluator.plot_diagnostic_report_realistic()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded inline in the UI metadata.
    with open(output_html, 'r') as report_file:
        report_html = report_file.read()
    ui_metadata = {
        'outputs': [{'type': 'web-app', 'storage': 'inline', 'source': report_html}]
    }

    VisualizationOutput = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return VisualizationOutput(json.dumps(ui_metadata))


# Rebind the name to the KFP component built from the function above.
diagnostic_report_component = comp.create_component_from_func(func=diagnostic_report_component, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## Pipeline

Connect all the modules and compile the pipeline

### Example
```
@dsl.pipeline(name='example_pipeline', description='example_desc')
def pipeline(n_rows: int):
	input_task = input_component()
	preprocess = preprocess_component(input_csv=input_task.output)
	generation = generation_component(n_rows, input_task.output, preprocess.output)
	diagnostic = diagnostic_report_component(input_synth_csv=generation.output, input_real_csv=input_task.output, input_json=preprocess.output)
	utility = utility_report_component(input_synth_csv=generation.output, input_real_csv=input_task.output, input_json=preprocess.output)
	privacy = privacy_report_component(input_synth_csv=generation.output, input_real_csv=input_task.output, input_json=preprocess.output)
```


In [23]:
@dsl.pipeline(name="gun_ownership_pipeline", description="Gun ownership dataset synthetic data pipeline")
def pipeline(n_rows: int):
    """Chain input, preprocessing, generation and the three report components.

    Args:
        n_rows: Number of synthetic rows, forwarded to generation_component.
    """
    source_step = input_component()
    metadata_step = preprocess_component(input_csv=source_step.output)
    synth_step = generation_component(
        n_rows=n_rows,
        input_csv=source_step.output,
        input_json=metadata_step.output,
    )

    # Every report compares the same synthetic/real pair using the same metadata.
    report_inputs = dict(
        input_synth_csv=synth_step.output,
        input_real_csv=source_step.output,
        input_json=metadata_step.output,
    )
    for build_report in (privacy_report_component, utility_report_component, diagnostic_report_component):
        build_report(**report_inputs)

In [24]:
# Compile the pipeline function into a YAML package written to the working directory.
Compiler().compile(pipeline_func=pipeline, package_path="gun_ownership_pipeline.yaml")