## Data Synthesis Pipeline

### Install Kubeflow Pipelines module

In [1]:
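# Colab: uncomment the next line to install Kubeflow Pipelines.
#!pip install kfp==1.8.22
# VS Code / local Jupyter: uncomment the next line instead.
#%pip install kfp==1.8.22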

In [2]:
from kfp import dsl
from kfp.compiler import Compiler
from kfp import components as comp
from typing import  NamedTuple

Before importing the SynthGuard library, install the package from your terminal: `pip install git+https://gitlab.cyber.ee/exai/synthguard`

In [None]:
# Packages that are pip-installed inside each component container via
# `packages_to_install`. Every entry must be a valid pip requirement, so
# SynthGuard is referenced by its Git URL rather than by a free-text placeholder.
synthguard_library = ['git+https://gitlab.cyber.ee/exai/synthguard']

## InputHandler Class

### Existing data from URL

In [3]:
from synthguard.input_handler import InputHandler

# Sample dataset (iris.csv) fetched over HTTP.
IRIS_CSV_URL = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"

inputHandler = InputHandler()
inputHandler.load_data_from_url(IRIS_CSV_URL)

# The handler exposes whatever it loaded through its `data` attribute.
input_data = inputHandler.data


Data loaded successfully from https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv


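Kubeflow Pipelines component: the same loading step wrapped as a pipeline component. Adjust the function parameters (and the data URL inside the function) to fit your use case.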

In [4]:

# NOTE(review): this function name shadows the built-in `input()`; it is kept
# unchanged so existing references to it stay valid.
def input(output_csv: comp.OutputPath('csv')):
    """Load the source dataset from its URL and save it as this component's CSV output.

    Args:
        output_csv: Destination path for the CSV file.
    """
    # Imports stay inside the function so it is self-contained when it is
    # turned into a component.
    from synthguard.input_handler import InputHandler
    import synthguard.helper_functions as sd

    source_url = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"

    handler = InputHandler()
    handler.load_data_from_url(source_url)
    sd.save_to_csv(handler.data, output_csv)


input_component = comp.create_component_from_func(
    func=input,
    base_image='python:3.10',
    packages_to_install=synthguard_library,
)


Error when stripping type annotations: No module named 'lib2to3'


## DataPreprocessor Class

In [10]:
from synthguard.data_preprocessor import DataPreprocessor

# Wrap the loaded data and extract its metadata.
dataPreprocessor = DataPreprocessor(data=input_data)
metadata = dataPreprocessor.extract_metadata()

# Read the data back from the preprocessor after metadata extraction.
processed_data = dataPreprocessor.data


In [11]:
# Plain-text preview of the preprocessed data.
print(processed_data)

     sepal.length  sepal.width  petal.length  petal.width    variety
0             5.1          3.5           1.4          0.2     Setosa
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
3             4.6          3.1           1.5          0.2     Setosa
4             5.0          3.6           1.4          0.2     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[150 rows x 5 columns]


Kubeflow Pipelines component: the same preprocessing step wrapped as a pipeline component. Adjust the function parameters to fit your use case.

In [5]:

def preprocess(input_csv: comp.InputPath('csv'), output_json: comp.OutputPath('json')):
    """Extract metadata from the input CSV and save it as this component's JSON output.

    Args:
        input_csv: Path of the CSV file to read.
        output_json: Destination path for the extracted metadata.
    """
    # Imports stay inside the function so it is self-contained when it is
    # turned into a component.
    from synthguard.data_preprocessor import DataPreprocessor
    import synthguard.helper_functions as sd

    source_frame = sd.load_data_csv(input_csv)
    preprocessor = DataPreprocessor(data=source_frame)
    sd.save_metadata(preprocessor.extract_metadata(), output_json)


preprocess_component = comp.create_component_from_func(
    func=preprocess,
    base_image='python:3.10',
    packages_to_install=synthguard_library,
)


Error when stripping type annotations: No module named 'lib2to3'


## SyntheticDataGenerator Class

In [7]:
from synthguard.synthetic_data_generator import SyntheticDataGenerator

# No output file in the interactive run (output_csv=None); the generated data
# is kept in `generated_data` instead.
syntheticDataGenerator = SyntheticDataGenerator(
    output_csv=None,
    n_rows=10000,
    method="realistic",
)
generated_data = syntheticDataGenerator.generate_synthetic_data(
    metadata=metadata,
    processed_data=processed_data,
)


Gen. (1.25) | Discrim. (-0.06): 100%|██████████| 1/1 [00:00<00:00, 11.06it/s]


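Kubeflow Pipelines component: the same generation step wrapped as a pipeline component. Adjust the function parameters (for example `n_rows` and `method` inside the function) to fit your use case.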

In [6]:

def generation(input_csv: comp.InputPath('csv'), input_json: comp.InputPath('json'), output_csv: comp.OutputPath('csv')):
    """Generate synthetic data from the input CSV and its metadata.

    Args:
        input_csv: Path of the CSV file passed to the generator as `processed_data`.
        input_json: Path of the metadata file.
        output_csv: Output path handed to `SyntheticDataGenerator`
            (presumably where it writes the synthetic CSV; confirm in synthguard).
    """
    # Imports stay inside the function so it is self-contained when it is
    # turned into a component.
    from synthguard.synthetic_data_generator import SyntheticDataGenerator
    import synthguard.helper_functions as sd

    loaded_metadata = sd.load_metadata(input_json)
    source_frame = sd.load_data_csv(input_csv)

    generator = SyntheticDataGenerator(
        output_csv=output_csv,
        n_rows=10000,
        method="realistic",
    )
    synthetic_frame = generator.generate_synthetic_data(
        metadata=loaded_metadata,
        processed_data=source_frame,
    )
    # Echo the result to the component's standard output.
    print(synthetic_frame)


generation_component = comp.create_component_from_func(
    func=generation,
    base_image='python:3.10',
    packages_to_install=synthguard_library,
)


Error when stripping type annotations: No module named 'lib2to3'


## Quality Report Class

In [9]:
from synthguard.quality_report_generator import DataQualityEvaluator

# Compare the real and synthetic data, then render the quality report.
dataQualityEvaluator = DataQualityEvaluator(
    real_data=processed_data,
    synthetic_data=generated_data,
    metadata=metadata,
)
dataQualityEvaluator.evaluate_quality()
dataQualityEvaluator.visualize_quality_report()

ModuleNotFoundError: No module named 'catboost'

Kubeflow Pipelines component: the same quality-report step wrapped as a pipeline component. Adjust the function parameters to fit your use case.

In [7]:

def qualityReport(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_metadata: comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Evaluate synthetic-data quality and return the HTML report as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV.
        input_real_csv: Path of the real data CSV.
        input_metadata: Path of the metadata file.
        output_html: Path of the HTML report file.

    Returns:
        A `VisualizationOutput` named tuple whose `mlpipeline_ui_metadata` field
        is a JSON string describing one inline `web-app` output that carries
        the report's HTML.
    """
    # Imports stay inside the function so it is self-contained when it is
    # turned into a component.
    from synthguard.quality_report_generator import DataQualityEvaluator
    import synthguard.helper_functions as sd
    import json

    # Start the HTML document.
    # NOTE(review): whether save_plot_to_html() appends to or overwrites this
    # file is not visible here; confirm in synthguard.
    with open(output_html, "w") as report_file:
        report_file.write("<html><body>")

    real_frame = sd.load_data_csv(input_real_csv)
    synthetic_frame = sd.load_data_csv(input_synth_csv)
    loaded_metadata = sd.load_metadata(input_metadata)

    evaluator = DataQualityEvaluator(
        real_data=real_frame,
        synthetic_data=synthetic_frame,
        metadata=loaded_metadata,
    )
    evaluator.evaluate_quality()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded inline.
    with open(output_html, 'r') as report_file:
        report_html = report_file.read()

    inline_view = {
        'type': 'web-app',
        'storage': 'inline',
        'source': report_html,
    }
    ui_metadata = {'outputs': [inline_view]}

    from collections import namedtuple
    result_type = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return result_type(json.dumps(ui_metadata))


quality_component = comp.create_component_from_func(
    func=qualityReport,
    base_image='python:3.10',
    packages_to_install=synthguard_library,
)


Error when stripping type annotations: No module named 'lib2to3'


## Privacy Report Class

In [None]:
from synthguard.privacy_report_generator import PrivacyRiskEvaluator

# Only the evaluator is constructed here; the evaluation call itself is disabled.
privacyRiskEvaluator = PrivacyRiskEvaluator(
    real_data=processed_data,
    synthetic_data=generated_data,
    metadata=metadata,
)
#privacyRiskEvaluator.evaluate_privacy()


Kubeflow Pipelines component: the same privacy-report step wrapped as a pipeline component. Adjust the function parameters to fit your use case.

In [None]:

def component(input_csv: comp.InputPath('csv'), output_csv: comp.OutputPath('csv')):
    """Construct a PrivacyRiskEvaluator (the evaluation call is currently disabled).

    Args:
        input_csv: Declared CSV input path; not read by the current body.
        output_csv: Declared CSV output path; not written by the current body.
    """
    from synthguard.privacy_report_generator import PrivacyRiskEvaluator
    # FIXME(review): `processed_data`, `generated_data` and `metadata` are neither
    # parameters nor locals of this function, so they are looked up as globals.
    # They are presumably unavailable when this runs as an isolated pipeline
    # component (NameError) - confirm, and load them from input paths instead.
    privacyRiskEvaluator = PrivacyRiskEvaluator(real_data = processed_data, synthetic_data = generated_data, metadata = metadata)
    #privacyRiskEvaluator.evaluate_privacy()

# Wrap the function as a component; note that it is not wired into the pipeline below.
privacy_component = comp.create_component_from_func(func = component, base_image = 'python:3.10', packages_to_install = synthguard_library)


Error when stripping type annotations: No module named 'lib2to3'


**Pipeline**

Connect all the modules and compile the pipeline

In [8]:
@dsl.pipeline(name='SDG_example_pipeline', description='')
def pipeline():
    """Chain the input, preprocessing, generation and quality-report components."""
    input_task = input_component()
    preprocess_task = preprocess_component(input_task.output)
    # NOTE(review): the generator receives the raw input CSV here, whereas the
    # interactive cells pass `dataPreprocessor.data`; confirm they are equivalent.
    generation_task = generation_component(input_task.output, preprocess_task.output)
    # Argument order: synthetic CSV, real CSV, metadata.
    quality_component(generation_task.output, input_task.output, preprocess_task.output)

In [9]:
# Compile the pipeline function into a YAML package file.
Compiler().compile(
    pipeline_func=pipeline,
    package_path='demo1-kfp-pipeline.yaml',
)