## Data Synthesis Pipeline

### Install Kubeflow Pipelines module

In [None]:
#Colab
#!pip install kfp==1.8.22
#VSCode
#%pip install kfp==1.8.22

In [None]:
from kfp import dsl
from kfp.compiler import Compiler
from kfp import components as comp
from typing import NamedTuple

## InputHandler Class

### Input data from URL

In [None]:
from synthguard.input_handler import InputHandler
# Fetch the source dataset from its public URL; the result is kept in the
# notebook-level variable `input_data`.
inputHandler = InputHandler()
input_data = inputHandler.load_data_from_url("https://opendata.smit.ee/ppa/csv/valjakutsed.csv")


Kubeflow Pipelines Component - Adjust function parameters accordingly

In [None]:

def input_component(input_csv: comp.InputPath('csv'), output_csv: comp.OutputPath('csv')):
    """Load the source dataset from its public URL.

    Args:
        input_csv: Path of an input CSV artifact. Not read by the current body.
        output_csv: Path of the output CSV artifact. Not written by the current body.
    """
    import synthguard.helper_functions as sd
    #Component logic
    from synthguard.input_handler import InputHandler
    inputHandler = InputHandler()
    input_data = inputHandler.load_data_from_url("https://opendata.smit.ee/ppa/csv/valjakutsed.csv")
    # NOTE(review): `input_data` is never persisted to `output_csv`, so a
    # downstream step would receive no artifact - TODO write it out once the
    # function parameters have been adjusted for the pipeline.


# Compiling function into a KFP component.
# base_image has to be a string; the bare synthguard/synthguard:latest was a SyntaxError.
input_component = comp.create_component_from_func(func=input_component, base_image="synthguard/synthguard:latest")


## DataPreprocessor Class

In [None]:
from synthguard.data_preprocessor import DataPreprocessor
# `input_data` is the variable produced by the InputHandler cell. The former
# `sd.load_data_csv(input_csv)` line used names that are only defined inside
# the component function, so it failed with NameError at notebook scope.
dataPreprocessor = DataPreprocessor(data=input_data)
metadata = dataPreprocessor.extract_metadata()
# The preprocessor exposes its data through the `data` attribute.
processed_data = dataPreprocessor.data


Kubeflow Pipelines Component - Adjust function parameters accordingly

In [None]:

def preprocess_component(input_csv: comp.InputPath('csv'), output_csv: comp.OutputPath('csv')):
    """Load a CSV artifact, run DataPreprocessor on it and extract its metadata.

    Args:
        input_csv: Path of the CSV artifact to preprocess.
        output_csv: Path of the output CSV artifact. Not written by the current body.
    """
    import synthguard.helper_functions as sd
    #Component logic
    from synthguard.data_preprocessor import DataPreprocessor
    input_data = sd.load_data_csv(input_csv)
    dataPreprocessor = DataPreprocessor(data=input_data)
    metadata = dataPreprocessor.extract_metadata()
    processed_data = dataPreprocessor.data
    # NOTE(review): neither `processed_data` nor `metadata` is written to an
    # output path, so nothing reaches downstream steps - TODO persist them
    # once the function parameters have been adjusted for the pipeline.


# Compiling function into a KFP component.
# base_image has to be a string; the bare synthguard/synthguard:latest was a SyntaxError.
preprocess_component = comp.create_component_from_func(func=preprocess_component, base_image="synthguard/synthguard:latest")


## SyntheticDataGenerator Class

In [None]:
from synthguard.synthetic_data_generator import SyntheticDataGenerator
# Configure the generator with no output CSV path, n_rows=1000 (presumably the
# number of synthetic rows - confirm against the synthguard API) and the
# "realistic" method.
syntheticDataGenerator = SyntheticDataGenerator(output_csv=None, n_rows=1000, method="realistic")
# Relies on `metadata` and `processed_data` from the DataPreprocessor cell.
generated_data = syntheticDataGenerator.generate_synthetic_data(metadata=metadata, processed_data=processed_data)


Kubeflow Pipelines Component - Adjust function parameters accordingly

In [None]:

def generation_component(input_csv: comp.InputPath('csv'), output_csv: comp.OutputPath('csv')):
    """Generate synthetic data with SyntheticDataGenerator.

    Args:
        input_csv: Path of an input CSV artifact. Not read by the current body.
        output_csv: Path of the output CSV artifact. Not written by the current body.
    """
    import synthguard.helper_functions as sd
    #Component logic
    from synthguard.synthetic_data_generator import SyntheticDataGenerator
    syntheticDataGenerator = SyntheticDataGenerator(output_csv=None, n_rows=1000, method="realistic")
    # NOTE(review): `metadata` and `processed_data` are not parameters or
    # locals of this function. A KFP function component presumably cannot see
    # notebook globals, so this line is expected to raise NameError - TODO
    # load both from input paths (e.g. sd.load_data_csv / sd.load_metadata)
    # once the function parameters have been adjusted for the pipeline.
    generated_data = syntheticDataGenerator.generate_synthetic_data(metadata=metadata, processed_data=processed_data)
    # NOTE(review): `generated_data` is never written to `output_csv` - TODO persist it.


# Compiling function into a KFP component.
# base_image has to be a string; the bare synthguard/synthguard:latest was a SyntaxError.
generation_component = comp.create_component_from_func(func=generation_component, base_image="synthguard/synthguard:latest")


## Privacy Report Class

In [None]:
from synthguard.privacy_report_generator import PrivacyRiskEvaluator
# Use the notebook-level results of the earlier cells (as the diagnostic cell
# does). The former sd.load_* lines referenced `sd`, `input_synth_csv`,
# `input_real_csv` and `input_json`, which only exist inside the component
# function and raised NameError at notebook scope.
privacyRiskEvaluator = PrivacyRiskEvaluator(real_data=processed_data, synthetic_data=generated_data, metadata=metadata, method="realistic")
privacyRiskEvaluator.evaluate_privacy()
privacyRiskEvaluator.visualize_privacy_report()


Kubeflow Pipelines Visual Component - Adjust function parameters accordingly

In [None]:

def privacy_report_component(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Evaluate privacy risk and expose the HTML report as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV artifact.
        input_real_csv: Path of the real data CSV artifact.
        input_json: Path of the metadata JSON artifact.
        output_html: Path of the HTML report artifact written by this step.

    Returns:
        A VisualizationOutput tuple whose mlpipeline_ui_metadata field is a JSON
        string describing one inline web-app output holding the report HTML.
    """
    import synthguard.helper_functions as sd
    import json

    with open(output_html, "w") as f:
        f.write("<html><body>")  # Start the HTML document

    #Component logic
    from synthguard.privacy_report_generator import PrivacyRiskEvaluator
    synthetic_data = sd.load_data_csv(input_synth_csv)
    real_data = sd.load_data_csv(input_real_csv)
    metadata = sd.load_metadata(input_json)
    privacyRiskEvaluator = PrivacyRiskEvaluator(real_data = real_data, synthetic_data = synthetic_data, metadata = metadata, method="realistic")
    privacyRiskEvaluator.evaluate_privacy()
    privacyRiskEvaluator.visualize_privacy_report()

    #Write visualization elements into output_html file (append mode, before the closing tags)

    # Close the document that was opened above so the report is well-formed HTML.
    with open(output_html, "a") as f:
        f.write("</body></html>")

    # Read the HTML content for UI metadata
    with open(output_html, 'r') as file:
        html_content = file.read()
    # Kept separate from the dataset `metadata` loaded above to avoid reusing
    # one name for two unrelated things.
    ui_metadata = {
        'outputs': [{'type': 'web-app', 'storage': 'inline', 'source': html_content}]
    }

    from collections import namedtuple
    visualization_output = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(ui_metadata))


# Compiling function into a KFP component.
# base_image has to be a string; the bare synthguard/synthguard:latest was a SyntaxError.
privacy_report_component = comp.create_component_from_func(func=privacy_report_component, base_image="synthguard/synthguard:latest")


## Utility Report Class

In [None]:
from synthguard.utility_report_generator import UtilityEvaluator
# Use the notebook-level results of the earlier cells (as the diagnostic cell
# does). The former sd.load_* lines referenced names that only exist inside
# the component function and raised NameError at notebook scope.
# The class is `UtilityEvaluator`; the original called the undefined
# lowercase name `utilityEvaluator`.
utilityEvaluator = UtilityEvaluator(real_data=processed_data, synthetic_data=generated_data, metadata=metadata, method="realistic")
utilityEvaluator.evaluate_utility()
utilityEvaluator.visualize_utility_report()


Kubeflow Pipelines Visual Component - Adjust function parameters accordingly

In [None]:

def utility_report_component(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Evaluate data utility and expose the HTML report as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV artifact.
        input_real_csv: Path of the real data CSV artifact.
        input_json: Path of the metadata JSON artifact.
        output_html: Path of the HTML report artifact written by this step.

    Returns:
        A VisualizationOutput tuple whose mlpipeline_ui_metadata field is a JSON
        string describing one inline web-app output holding the report HTML.
    """
    import synthguard.helper_functions as sd
    import json

    with open(output_html, "w") as f:
        f.write("<html><body>")  # Start the HTML document

    #Component logic
    from synthguard.utility_report_generator import UtilityEvaluator
    synthetic_data = sd.load_data_csv(input_synth_csv)
    real_data = sd.load_data_csv(input_real_csv)
    metadata = sd.load_metadata(input_json)
    # Instantiate the imported class. The original `utilityEvaluator(...)` call
    # read the local variable before it was assigned (UnboundLocalError).
    utilityEvaluator = UtilityEvaluator(real_data = real_data, synthetic_data = synthetic_data, metadata = metadata, method="realistic")
    utilityEvaluator.evaluate_utility()
    utilityEvaluator.visualize_utility_report()

    #Write visualization elements into output_html file (append mode, before the closing tags)

    # Close the document that was opened above so the report is well-formed HTML.
    with open(output_html, "a") as f:
        f.write("</body></html>")

    # Read the HTML content for UI metadata
    with open(output_html, 'r') as file:
        html_content = file.read()
    # Kept separate from the dataset `metadata` loaded above to avoid reusing
    # one name for two unrelated things.
    ui_metadata = {
        'outputs': [{'type': 'web-app', 'storage': 'inline', 'source': html_content}]
    }

    from collections import namedtuple
    visualization_output = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(ui_metadata))


# Compiling function into a KFP component.
# base_image has to be a string; the bare synthguard/synthguard:latest was a SyntaxError.
utility_report_component = comp.create_component_from_func(func=utility_report_component, base_image="synthguard/synthguard:latest")


## Diagnostic Report Class

In [None]:
from synthguard.diagnostic_report_generator import DiagnosticEvaluator
# Compares `processed_data` (DataPreprocessor cell) with `generated_data`
# (SyntheticDataGenerator cell), using the `metadata` extracted earlier.
diagnosticEvaluator = DiagnosticEvaluator(real_data=processed_data, synthetic_data=generated_data, metadata=metadata)
diagnosticEvaluator.evaluate_diagnostic()
diagnosticEvaluator.visualize_diagnostic_report()


Kubeflow Pipelines Visual Component - Adjust function parameters accordingly

In [None]:

def diagnostic_report_component(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Run the diagnostic evaluation and expose the HTML report as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV artifact.
        input_real_csv: Path of the real data CSV artifact.
        input_json: Path of the metadata JSON artifact.
        output_html: Path of the HTML report artifact written by this step.

    Returns:
        A VisualizationOutput tuple whose mlpipeline_ui_metadata field is a JSON
        string describing one inline web-app output holding the report HTML.
    """
    import synthguard.helper_functions as sd
    import json

    with open(output_html, "w") as f:
        f.write("<html><body>")  # Start the HTML document

    #Component logic
    from synthguard.diagnostic_report_generator import DiagnosticEvaluator
    # Load the inputs from the artifact paths, as the privacy and utility
    # components do. The original body read `processed_data`, `generated_data`
    # and `metadata`, none of which are parameters of this function;
    # `metadata` was additionally a local assigned only further down.
    synthetic_data = sd.load_data_csv(input_synth_csv)
    real_data = sd.load_data_csv(input_real_csv)
    metadata = sd.load_metadata(input_json)
    diagnosticEvaluator = DiagnosticEvaluator(real_data=real_data, synthetic_data=synthetic_data, metadata=metadata)
    diagnosticEvaluator.evaluate_diagnostic()
    diagnosticEvaluator.visualize_diagnostic_report()

    #Write visualization elements into output_html file (append mode, before the closing tags)

    # Close the document that was opened above so the report is well-formed HTML.
    with open(output_html, "a") as f:
        f.write("</body></html>")

    # Read the HTML content for UI metadata
    with open(output_html, 'r') as file:
        html_content = file.read()
    # Kept separate from the dataset `metadata` loaded above to avoid reusing
    # one name for two unrelated things.
    ui_metadata = {
        'outputs': [{'type': 'web-app', 'storage': 'inline', 'source': html_content}]
    }

    from collections import namedtuple
    visualization_output = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return visualization_output(json.dumps(ui_metadata))


# Compiling function into a KFP component.
# base_image has to be a string; the bare synthguard/synthguard:latest was a SyntaxError.
diagnostic_report_component = comp.create_component_from_func(func=diagnostic_report_component, base_image="synthguard/synthguard:latest")


## Pipeline

Connect all the modules and compile the pipeline

### Example
```
@dsl.pipeline(name='example_pipeline', description='example_desc')
def pipeline():
	input = input_component()
	preprocess = preprocess_component(input.output)
	generation = generation_component(input.output, preprocess.output)
	diagnostic = diagnostic_report_component(input.output, generation.output, preprocess.output)
	utility = utility_report_component(input.output, generation.output, preprocess.output)
	privacy = privacy_report_component(input.output, generation.output, preprocess.output)
```


In [None]:
@dsl.pipeline(name="police_call_pipeline", description="Police & Border Guard (PPA) police call dataset synthetic data pipeline")
def pipeline():
	"""Placeholder pipeline definition; no components are wired together yet."""
	#TODO
	pass

In [None]:
# Compile the pipeline function into the workflow file police_call_pipeline.yaml.
Compiler().compile(pipeline, "police_call_pipeline.yaml")

## Add ImagePullSecret. Mandatory to run for Kubeflow execution!

In [None]:

import yaml
# Load the compiled workflow definition produced by the compile cell.
with open("police_call_pipeline.yaml", "r") as file:
    workflow_yaml = yaml.safe_load(file)
# Set spec.imagePullSecrets to reference the secret named "regcred"
# (replaces any value already present under that key).
workflow_yaml["spec"]["imagePullSecrets"] = [{"name": "regcred"}]
# Write the patched workflow back to the same file in block style.
with open("police_call_pipeline.yaml", "w") as file:
    yaml.dump(workflow_yaml, file, default_flow_style=False)
        