# Imports

In [23]:
# Container image passed as `base_image` to every component built in this file.
BASE_IMAGE = "gitlab.cyber.ee:5050/exai/data-synthesis-workflow-engine:latest"

In [24]:
from kfp import dsl
from kfp.compiler import Compiler
from kfp import components as comp
from typing import  NamedTuple

# Load Data

In [25]:
def input(output_csv: comp.OutputPath('csv')):
    """Load the source dataset and store it as this component's CSV output.

    NOTE(review): the function name shadows the built-in ``input``; it is kept
    because the component is created from this function by name.

    Args:
        output_csv: Path the loaded data is written to.
    """
    import time
    start_time = time.time()

    # Imported after the timer starts, so import time counts towards the
    # reported duration.
    import synthguard.helper_functions as sd
    from synthguard.input_handler import InputHandler

    print('Started loading data')
    loader = InputHandler()
    loader.load_data_csv(
        "https://opendata.smit.ee/ppa/csv/elamislubade_taotlused.csv",
        n_rows=1000,  # presumably caps the number of rows read -- confirm in synthguard
    )
    print('Finished loading data')

    sd.save_to_csv(loader.data, output_csv)

    duration = (time.time() - start_time) / 60
    print(f"Component completed in {duration:.2f} minutes.")

# Compile the function into a KFP component
input_component = comp.create_component_from_func(
    func=input,
    base_image=BASE_IMAGE,
)

# Preprocess Data

In [26]:
def preprocess(input_csv: comp.InputPath('csv'), output_json: comp.OutputPath('json')):
    """Preprocess the input data and save the resulting metadata.

    Args:
        input_csv: Path of the CSV file to preprocess.
        output_json: Path the metadata is saved to.
    """
    import time
    start_time = time.time()
    import synthguard.helper_functions as sd
    from synthguard.data_preprocessor import DataPreprocessor

    raw_data = sd.load_data_csv(input_csv)
    preprocessor = DataPreprocessor(data=raw_data)

    # NOTE(review): only the metadata is persisted; the processed data that
    # is returned alongside it is discarded by this component.
    _processed_data, metadata = preprocessor.preprocess_data()
    sd.save_metadata(metadata, output_json)

    duration = (time.time() - start_time) / 60
    print(f"Component completed in {duration:.2f} minutes.")


# Compile the function into a KFP component
preprocess_component = comp.create_component_from_func(
    func=preprocess,
    base_image=BASE_IMAGE,
)

# Synthetic Data Generation using CopulaGANSynthesizer

In [27]:
def generation(input_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_csv: comp.OutputPath('csv')):
    """Generate synthetic data from the real data and its metadata.

    Args:
        input_csv: Path of the real data CSV.
        input_json: Path of the metadata JSON.
        output_csv: Path handed to the generator as ``output_csv``
            (presumably where the synthetic data is written -- confirm
            in synthguard).
    """
    import time
    start_time = time.time()
    import synthguard.helper_functions as sd
    from synthguard.synthetic_data_generator import SyntheticDataGenerator

    # Generation settings
    n_rows = 1000
    epochs = 32
    # NOTE(review): 'ee_ET' looks like a transposed 'et_EE' (Estonian) --
    # confirm which locale string synthguard expects.
    locales = 'ee_ET'
    synthetic_data_type = 'realistic'

    metadata = sd.load_metadata(input_json)
    real_data = sd.load_data_csv(input_csv)

    generator = SyntheticDataGenerator(
        output_csv=output_csv,
        n_rows=n_rows,
        method=synthetic_data_type,
        locales=locales,
    )
    # The return value is not used by this component.
    generator.generate_synthetic_data(
        metadata=metadata,
        processed_data=real_data,
        Nepochs=epochs,
    )

    duration = (time.time() - start_time) / 60
    print(f"Component completed in {duration:.2f} minutes.")

# Compile the function into a KFP component
generation_component = comp.create_component_from_func(
    func=generation,
    base_image=BASE_IMAGE,
)


# Diagnostic Report

* The Diagnostic Report runs some basic checks for data format and validity. Run this to ensure that you have created valid synthetic data.
    * **Data Validity:** Basic validity checks for each of the columns. For example, continuous values in the synthetic data must adhere to the min/max range in the real data
    * **Structure:** Checks to ensure the real and synthetic data have the same column names and types. 

In [28]:
def diagnosticReport(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build the diagnostic report and return it as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV.
        input_real_csv: Path of the real data CSV.
        input_json: Path of the metadata JSON.
        output_html: Path the HTML report is written to.

    Returns:
        A ``VisualizationOutput`` tuple whose ``mlpipeline_ui_metadata`` field
        is a JSON string embedding the report HTML as an inline web-app.
    """
    import time
    start_time = time.time()

    import synthguard.helper_functions as sd
    import json
    from collections import namedtuple

    # Seed the output file with the opening tags.
    # NOTE(review): whether save_plot_to_html appends to or replaces this
    # content is not visible from here.
    with open(output_html, "w") as report_file:
        report_file.write("<html><body>")

    from synthguard.diagnostic_report_generator import DiagnosticEvaluator

    real_data = sd.load_data_csv(input_real_csv)
    synthetic_data = sd.load_data_csv(input_synth_csv)
    data_metadata = sd.load_metadata(input_json)
    synthetic_data_type = 'realistic'

    evaluator = DiagnosticEvaluator(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=data_metadata,
        method=synthetic_data_type,
    )
    evaluator.run_diagnostic_realistic()
    evaluator.plot_diagnostic_report_realistic()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded in the UI metadata.
    with open(output_html, 'r') as report_file:
        html_content = report_file.read()

    ui_metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])

    duration = (time.time() - start_time) / 60
    print(f"Component completed in {duration:.2f} minutes.")

    return visualization_output(json.dumps(ui_metadata))

# Compile the function into a KFP component
diagnostic_component = comp.create_component_from_func(
    func=diagnosticReport,
    base_image=BASE_IMAGE,
)


# Utility Report


In [29]:
def qualityReport(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build the quality (utility) report and return it as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV.
        input_real_csv: Path of the real data CSV.
        input_json: Path of the metadata JSON.
        output_html: Path the HTML report is written to.

    Returns:
        A ``VisualizationOutput`` tuple whose ``mlpipeline_ui_metadata`` field
        is a JSON string embedding the report HTML as an inline web-app.
    """
    import time
    start_time = time.time()

    import synthguard.helper_functions as sd
    import json
    from collections import namedtuple

    # Seed the output file with the opening tags.
    # NOTE(review): whether save_plot_to_html appends to or replaces this
    # content is not visible from here.
    with open(output_html, "w") as report_file:
        report_file.write("<html><body>")

    from synthguard.quality_report_generator import DataQualityEvaluator

    real_data = sd.load_data_csv(input_real_csv)
    synthetic_data = sd.load_data_csv(input_synth_csv)
    data_metadata = sd.load_metadata(input_json)
    synthetic_data_type = 'realistic'

    evaluator = DataQualityEvaluator(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=data_metadata,
        method=synthetic_data_type,
    )
    evaluator.evaluate_quality()
    evaluator.plot_quality_report_realistic()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded in the UI metadata.
    with open(output_html, 'r') as report_file:
        html_content = report_file.read()

    ui_metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])

    duration = (time.time() - start_time) / 60
    print(f"Component completed in {duration:.2f} minutes.")

    return visualization_output(json.dumps(ui_metadata))

# Compile the function into a KFP component
utility_component = comp.create_component_from_func(
    func=qualityReport,
    base_image=BASE_IMAGE,
)

# Privacy Report

In [30]:
def privacy(input_synth_csv: comp.InputPath('csv'), input_real_csv: comp.InputPath('csv'), input_json:comp.InputPath('json'), output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build the privacy report and return it as inline UI metadata.

    Args:
        input_synth_csv: Path of the synthetic data CSV.
        input_real_csv: Path of the real data CSV.
        input_json: Path of the metadata JSON.
        output_html: Path the HTML report is written to.

    Returns:
        A ``VisualizationOutput`` tuple whose ``mlpipeline_ui_metadata`` field
        is a JSON string embedding the report HTML as an inline web-app.
    """
    import time
    start_time = time.time()
    import synthguard.helper_functions as sd
    import json
    from collections import namedtuple

    # Seed the output file with the opening tags.
    # NOTE(review): whether save_plot_to_html appends to or replaces this
    # content is not visible from here.
    with open(output_html, "w") as report_file:
        report_file.write("<html><body>")

    from synthguard.privacy_report_generator import PrivacyRiskEvaluator

    real_data = sd.load_data_csv(input_real_csv)
    synthetic_data = sd.load_data_csv(input_synth_csv)
    data_metadata = sd.load_metadata(input_json)
    synthetic_data_type = 'realistic'

    evaluator = PrivacyRiskEvaluator(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=data_metadata,
        method=synthetic_data_type,
    )
    evaluator.run_privacy_realistic()
    evaluator.plot_privacy_metrics_realistic()
    evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded in the UI metadata.
    with open(output_html, 'r') as report_file:
        html_content = report_file.read()

    ui_metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content,
        }]
    }
    visualization_output = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])

    duration = (time.time() - start_time) / 60
    print(f"Component completed in {duration:.2f} minutes.")

    return visualization_output(json.dumps(ui_metadata))

# Compile the function into a KFP component
privacy_component = comp.create_component_from_func(
    func=privacy,
    base_image=BASE_IMAGE,
)

# KFP Pipeline

In [31]:
@dsl.pipeline(name='test_pipeline_1k', description='')
def pipeline():
    # Wires the workflow: load -> preprocess -> generate, then the three
    # reports (diagnostic, utility, privacy) on the real vs. synthetic data.
    #
    # Every input is passed by keyword. The report components declare their
    # parameters as (input_synth_csv, input_real_csv, input_json); passing the
    # loaded data first and the generated data second by position would feed
    # the real data in as "synthetic" and vice versa.
    load_task = input_component()
    preprocess_task = preprocess_component(input_csv=load_task.output)
    generation_task = generation_component(
        input_csv=load_task.output,
        input_json=preprocess_task.output,
    )

    # All three reports consume the same artifacts.
    report_inputs = {
        'input_synth_csv': generation_task.output,
        'input_real_csv': load_task.output,
        'input_json': preprocess_task.output,
    }
    diagnostic_component(**report_inputs)
    utility_component(**report_inputs)
    privacy_component(**report_inputs)

In [32]:
# Compile the pipeline function into a workflow definition file.
yamlfile = "EXP_1K.yaml"
Compiler().compile(pipeline_func=pipeline, package_path=yamlfile)

In [33]:
# Add the `regcred` image pull secret to the compiled workflow spec and
# write the file back in place.
import yaml

with open(yamlfile, "r") as file:
    workflow_yaml = yaml.safe_load(file)

workflow_yaml["spec"].update({"imagePullSecrets": [{"name": "regcred"}]})

with open(yamlfile, "w") as file:
    yaml.dump(workflow_yaml, file, default_flow_style=False)