In [1]:
from kfp import dsl
from kfp.compiler import Compiler
from kfp import components as comp
from typing import NamedTuple

In [2]:
import os

# Credentials must never be committed to a notebook. The access token for the
# synthguard repository is read from the environment instead:
#   SYNTHGUARD_GIT_USER  - account name (defaults to the repository owner)
#   SYNTHGUARD_GIT_TOKEN - personal access token; when unset, the URL is built
#                          without credentials.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked.
_git_user = os.environ.get('SYNTHGUARD_GIT_USER', 'mshoush')
_git_token = os.environ.get('SYNTHGUARD_GIT_TOKEN', '')
_git_auth = f'{_git_user}:{_git_token}@' if _git_token else ''
dependencies = [f'git+https://{_git_auth}github.com/mshoush/synthguard.git']

In [3]:
# Container image passed as `base_image` to every component created in this notebook.
# NOTE(review): `:latest` is a mutable tag, so pipeline runs are not pinned to a
# specific image build.
BASE_IMAGE = 'gitlab.ext.cyber.ee:5050/exai/synthguard:latest'

## Input

In [4]:
# %load_ext autoreload
# %autoreload 2


# from helper_functions import load_json, handle_nested_data_json, reverse_flatten
# import pandas as pd

# files_to_be_zipped = []

# input_path = "./../../data-synthesis/docs/examples/energy-pilot-teadal/datasets/"
# output_path = 'synthetic_datasets/teadal/'

# file1 = 'sir-min-temp.json'
# file2 = 'sir-max-temp.json'

# real_data1 = handle_nested_data_json(pd.json_normalize(load_json(input_path + file1)))
# real_data2 = handle_nested_data_json(pd.json_normalize(load_json(input_path + file2)))


In [5]:
def sir_input(input_path:str,
          output_min_csv: comp.OutputPath('csv'), 
          output_max_csv: comp.OutputPath('csv'),
          output_json: comp.OutputPath('json')):
    """Load the SIR min/max temperature JSON files and export them for later steps.

    Args:
        input_path: Directory containing 'sir-min-temp.json' and
            'sir-max-temp.json'. A trailing slash is optional.
        output_min_csv: Path that receives the flattened min-temperature data.
        output_max_csv: Path that receives the flattened max-temperature data.
        output_json: Location handed to `save_json` together with the file name
            'sir-min-temp.json'; it receives a copy of the original min JSON.
    """
    # Imports live inside the function because the component body is executed
    # on its own inside the container.
    import os

    import pandas as pd
    from synthguard.helper_functions import load_json, handle_nested_data_json, save_to_csv, save_json

    file1 = 'sir-min-temp.json'
    file2 = 'sir-max-temp.json'

    # os.path.join works whether or not input_path ends with a separator,
    # unlike plain string concatenation.
    min_json = load_json(os.path.join(input_path, file1))
    max_json = load_json(os.path.join(input_path, file2))

    # Store the untouched JSON first (before any flattening) so it is read from
    # disk only once.
    save_json(output_json, file1, min_json)

    real_data1 = handle_nested_data_json(pd.json_normalize(min_json))
    real_data2 = handle_nested_data_json(pd.json_normalize(max_json))

    save_to_csv(real_data1, output_min_csv)
    save_to_csv(real_data2, output_max_csv)

input_component = comp.create_component_from_func(sir_input, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


## Preprocess

In [6]:
# from data_preprocessor import DataPreprocessor 

# dataPreprocessor1 = DataPreprocessor(data = real_data1)
# processed_data1, metadata1 = dataPreprocessor1.preprocess_data()

# dataPreprocessor2 = DataPreprocessor(data = real_data2)
# processed_data2, metadata2 = dataPreprocessor2.preprocess_data()

In [7]:
def sir_preprocess(input_min_csv: comp.InputPath('csv'), 
               input_max_csv: comp.InputPath('csv'),
               output_min_metadata: comp.OutputPath('json'),
               output_max_metadata: comp.OutputPath('json')):
    """Run the data preprocessor on both SIR datasets and store their metadata.

    Args:
        input_min_csv: CSV with the min-temperature data.
        input_max_csv: CSV with the max-temperature data.
        output_min_metadata: Path that receives the min-temperature metadata.
        output_max_metadata: Path that receives the max-temperature metadata.

    Only the metadata returned by `preprocess_data` is persisted; the processed
    frames are discarded.
    """
    from synthguard.data_preprocessor import DataPreprocessor
    from synthguard.helper_functions import save_metadata, load_data_csv

    # Both inputs are read up front, then handled one after the other.
    min_frame = load_data_csv(input_min_csv)
    max_frame = load_data_csv(input_max_csv)

    for frame, metadata_path in ((min_frame, output_min_metadata),
                                 (max_frame, output_max_metadata)):
        _, metadata = DataPreprocessor(data=frame).preprocess_data()
        save_metadata(metadata, metadata_path)

preprocess_comp = comp.create_component_from_func(sir_preprocess, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


## Generation

In [8]:
# from synthetic_data_generator import SyntheticDataGenerator


# output1_json = 'sir-min-temp-synthetic.csv'
# output2_json = 'sir-max-temp-synthetic.csv'


# if output1_json:
#     # Create the output path if it does not exist
#     import os
#     if not os.path.exists(output_path):
#         os.makedirs(output_path)
#     output1 = output_path + output1_json


# if output2_json:
#     # Create the output path if it does not exist
#     import os
#     if not os.path.exists(output_path):
#         os.makedirs(output_path)
#     output2 = output_path + output1_json


# N_Rows = 1000
# EPOCHS = 1
# Locales = 'ee_ET'
# synthetic_data_type = 'realistic'

# syntheticDataGenerator1 = SyntheticDataGenerator(locales=Locales, n_rows=N_Rows, method=synthetic_data_type, output_csv=output1)
# generated_data1 = syntheticDataGenerator1.generate_synthetic_data(metadata = metadata1, processed_data = processed_data1, Nepochs=EPOCHS)

# syntheticDataGenerator2 = SyntheticDataGenerator(locales=Locales, n_rows=N_Rows, method=synthetic_data_type, output_csv=output2)
# generated_data2 = syntheticDataGenerator2.generate_synthetic_data(metadata = metadata2, processed_data = processed_data2, Nepochs=EPOCHS)


In [9]:
def sir_generation(n_rows: int,
               input_min_csv: comp.InputPath('csv'),
               input_max_csv: comp.InputPath('csv'),
               input_min_metadata: comp.InputPath('json'),
               input_max_metadata: comp.InputPath('json'),
               output_min_csv:comp.OutputPath('csv'),
               output_max_csv:comp.OutputPath('csv')):
    """Generate synthetic min/max temperature data with SyntheticDataGenerator.

    Args:
        n_rows: Number of rows requested from each generator.
        input_min_csv: CSV with the min-temperature data.
        input_max_csv: CSV with the max-temperature data.
        input_min_metadata: Metadata for the min-temperature data.
        input_max_metadata: Metadata for the max-temperature data.
        output_min_csv: Passed to the min generator as `output_csv`.
        output_max_csv: Passed to the max generator as `output_csv`.

    Nothing is written directly here; the output paths are only handed to the
    generators.
    """
    from synthguard.synthetic_data_generator import SyntheticDataGenerator
    from synthguard.helper_functions import load_metadata, load_data_csv, save_to_csv

    min_metadata = load_metadata(input_min_metadata)
    max_metadata = load_metadata(input_max_metadata)

    min_frame = load_data_csv(input_min_csv)
    max_frame = load_data_csv(input_max_csv)

    epochs = 1
    # NOTE(review): 'ee_ET' looks like a transposed 'et_EE' (used elsewhere in
    # this notebook) - confirm which locale the generator expects.
    locale = 'ee_ET'
    method = 'realistic'

    jobs = (
        (min_metadata, min_frame, output_min_csv),
        (max_metadata, max_frame, output_max_csv),
    )
    for metadata, frame, target_csv in jobs:
        generator = SyntheticDataGenerator(locales=locale, n_rows=n_rows, method=method, output_csv=target_csv)
        generator.generate_synthetic_data(metadata=metadata, processed_data=frame, Nepochs=epochs)

generation_comp = comp.create_component_from_func(sir_generation, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


## Combine

In [10]:
def sir_combine(input_min_csv: comp.InputPath('csv'),
                input_max_csv: comp.InputPath('csv'),
                input_json: comp.InputPath('json'),
                output_combined_json: comp.OutputPath('json')):
    """Rebuild nested JSON from both synthetic CSVs and save them as one document.

    Args:
        input_min_csv: Synthetic min-temperature CSV.
        input_max_csv: Synthetic max-temperature CSV.
        input_json: Directory holding 'sir-min-temp.json', whose structure is
            used as the template for `reverse_flatten`.
        output_combined_json: Location handed to `save_json` together with the
            file name 'sir-min-max-temp-synthetic.json'.

    Raises:
        ValueError: If the template JSON does not load as a dictionary.
    """
    import os
    from synthguard.helper_functions import reverse_flatten, save_json, load_data_csv, load_json

    input_json_path = os.path.join(input_json, 'sir-min-temp.json')
    output_combined = 'sir-min-max-temp-synthetic.json'

    # Log every path this step works with.
    for label, value in (('input_min_csv', input_min_csv),
                         ('input_max_csv', input_max_csv),
                         ('input_json', input_json_path),
                         ('output_combined_json', output_combined_json)):
        print(f"{label} path: {value}")

    synthetic_frames = (load_data_csv(input_min_csv), load_data_csv(input_max_csv))
    original_json_structure = load_json(input_json_path)

    # The template must be a dictionary before it is used for rebuilding.
    if not isinstance(original_json_structure, dict):
        raise ValueError(f"Expected a dictionary from {input_json_path}, but got {type(original_json_structure)}")

    # Key 0 holds the rebuilt min data, key 1 the rebuilt max data.
    combined_data = {
        position: reverse_flatten(frame, original_json_structure)
        for position, frame in enumerate(synthetic_frames)
    }

    # Create the parent directory of the output location, then save.
    os.makedirs(os.path.dirname(output_combined_json), exist_ok=True)
    save_json(output_combined_json, output_combined, combined_data)

    print(f"Merged JSON saved to {os.path.join(output_combined_json, output_combined)}")
    
# Define the component
combine_comp = comp.create_component_from_func(sir_combine, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


# Diagnostic Report

In [11]:
# from diagnostic_report_generator import DiagnosticEvaluator

# diagnosticReportGenerator1 = DiagnosticEvaluator(real_data = processed_data1, synthetic_data = generated_data1, metadata = metadata1, method=synthetic_data_type)
# diagnosticReportGenerator1.run_diagnostic_realistic()
# diagnosticReportGenerator1.plot_diagnostic_report_realistic(output_path = output_path)

# diagnosticReportGenerator2 = DiagnosticEvaluator(real_data = processed_data2, synthetic_data = generated_data2, metadata = metadata2, method=synthetic_data_type)
# diagnosticReportGenerator2.run_diagnostic_realistic()
# diagnosticReportGenerator2.plot_diagnostic_report_realistic(output_path = output_path)


In [12]:

def sir_diagnostic(input_min_real_csv: comp.InputPath('csv'),
               input_max_real_csv: comp.InputPath('csv'),
               input_min_synth_csv: comp.InputPath('csv'),
               input_max_synth_csv: comp.InputPath('csv'),
               input_min_metadata: comp.InputPath('json'),
               input_max_metadata: comp.InputPath('json'),
               output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build the diagnostic report for both SIR datasets and expose it in the UI.

    Args:
        input_min_real_csv: Real min-temperature data.
        input_max_real_csv: Real max-temperature data.
        input_min_synth_csv: Synthetic min-temperature data.
        input_max_synth_csv: Synthetic max-temperature data.
        input_min_metadata: Metadata for the min-temperature data.
        input_max_metadata: Metadata for the max-temperature data.
        output_html: HTML file that both evaluators save their plots to.

    Returns:
        A `VisualizationOutput` named tuple whose `mlpipeline_ui_metadata` field
        is a JSON string embedding the HTML file content as an inline web-app.
    """
    import json
    from collections import namedtuple

    from synthguard.diagnostic_report_generator import DiagnosticEvaluator
    from synthguard.helper_functions import load_data_csv, load_metadata

    # Start the HTML document.
    # NOTE(review): no closing </body></html> is written in this function;
    # confirm how save_plot_to_html treats the existing file.
    with open(output_html, "w") as report:
        report.write("<html><body>")

    real_min = load_data_csv(input_min_real_csv)
    real_max = load_data_csv(input_max_real_csv)

    synth_min = load_data_csv(input_min_synth_csv)
    synth_max = load_data_csv(input_max_synth_csv)

    metadata_min = load_metadata(input_min_metadata)
    metadata_max = load_metadata(input_max_metadata)

    # Min dataset first, then max; both save into the same HTML file.
    for real, synthetic, metadata in ((real_min, synth_min, metadata_min),
                                      (real_max, synth_max, metadata_max)):
        evaluator = DiagnosticEvaluator(real_data=real, synthetic_data=synthetic, metadata=metadata, method='realistic')
        evaluator.run_diagnostic_realistic()
        evaluator.plot_diagnostic_report_realistic()
        evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded in the UI metadata.
    with open(output_html, 'r') as report:
        ui_metadata = {
            'outputs': [{
                'type': 'web-app',
                'storage': 'inline',
                'source': report.read(),
            }]
        }

    result_type = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return result_type(json.dumps(ui_metadata))

diagnostic_component = comp.create_component_from_func(sir_diagnostic, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


# Utility Report

In [13]:
# from synthguard.quality_report_generator import DataQualityEvaluator

# dataQualityEvaluator1 = DataQualityEvaluator(real_data = processed_data1, synthetic_data = generated_data1, metadata = metadata1, method=synthetic_data_type)
# dataQualityEvaluator1.evaluate_quality()
# dataQualityEvaluator1.plot_quality_report_realistic(output_path = output_path)


# dataQualityEvaluator2 = DataQualityEvaluator(real_data = processed_data2, synthetic_data = generated_data2, metadata = metadata2, method=synthetic_data_type)
# dataQualityEvaluator2.evaluate_quality()
# dataQualityEvaluator2.plot_quality_report_realistic(output_path = output_path)

In [14]:

def sir_quality(input_min_real_csv: comp.InputPath('csv'),
               input_max_real_csv: comp.InputPath('csv'),
               input_min_synth_csv: comp.InputPath('csv'),
               input_max_synth_csv: comp.InputPath('csv'),
               input_min_metadata: comp.InputPath('json'),
               input_max_metadata: comp.InputPath('json'),
               output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build the quality (utility) report for both SIR datasets and expose it in the UI.

    Args:
        input_min_real_csv: Real min-temperature data.
        input_max_real_csv: Real max-temperature data.
        input_min_synth_csv: Synthetic min-temperature data.
        input_max_synth_csv: Synthetic max-temperature data.
        input_min_metadata: Metadata for the min-temperature data.
        input_max_metadata: Metadata for the max-temperature data.
        output_html: HTML file that both evaluators save their plots to.

    Returns:
        A `VisualizationOutput` named tuple whose `mlpipeline_ui_metadata` field
        is a JSON string embedding the HTML file content as an inline web-app.
    """
    import json
    from collections import namedtuple

    from synthguard.quality_report_generator import DataQualityEvaluator
    from synthguard.helper_functions import load_data_csv, load_metadata

    # Start the HTML document.
    # NOTE(review): no closing </body></html> is written in this function;
    # confirm how save_plot_to_html treats the existing file.
    with open(output_html, "w") as report:
        report.write("<html><body>")

    real_min = load_data_csv(input_min_real_csv)
    real_max = load_data_csv(input_max_real_csv)

    synth_min = load_data_csv(input_min_synth_csv)
    synth_max = load_data_csv(input_max_synth_csv)

    metadata_min = load_metadata(input_min_metadata)
    metadata_max = load_metadata(input_max_metadata)

    # Min dataset first, then max; both save into the same HTML file.
    for real, synthetic, metadata in ((real_min, synth_min, metadata_min),
                                      (real_max, synth_max, metadata_max)):
        evaluator = DataQualityEvaluator(real_data=real, synthetic_data=synthetic, metadata=metadata, method='realistic')
        evaluator.evaluate_quality()
        evaluator.plot_quality_report_realistic()
        evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded in the UI metadata.
    with open(output_html, 'r') as report:
        ui_metadata = {
            'outputs': [{
                'type': 'web-app',
                'storage': 'inline',
                'source': report.read(),
            }]
        }

    result_type = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return result_type(json.dumps(ui_metadata))

quality_comp = comp.create_component_from_func(sir_quality, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


# Privacy Report

In [15]:
# from privacy_report_generator import PrivacyRiskEvaluator

# privacyRiskEvaluator1 = PrivacyRiskEvaluator(real_data = processed_data1, synthetic_data = generated_data1, metadata = metadata1, method=synthetic_data_type)
# privacyRiskEvaluator1.run_privacy_realistic()
# privacyRiskEvaluator1.plot_privacy_metrics_realistic(output_path = output_path)

# privacyRiskEvaluator2 = PrivacyRiskEvaluator(real_data = processed_data2, synthetic_data = generated_data2, metadata = metadata2, method=synthetic_data_type)       
# privacyRiskEvaluator2.run_privacy_realistic()
# privacyRiskEvaluator2.plot_privacy_metrics_realistic(output_path = output_path)

In [16]:
def sir_privacy(input_min_real_csv: comp.InputPath('csv'),
               input_max_real_csv: comp.InputPath('csv'),
               input_min_synth_csv: comp.InputPath('csv'),
               input_max_synth_csv: comp.InputPath('csv'),
               input_min_metadata: comp.InputPath('json'),
               input_max_metadata: comp.InputPath('json'),
               output_html: comp.OutputPath('html')) -> NamedTuple('VisualizationOutput', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Build the privacy report for both SIR datasets and expose it in the UI.

    Args:
        input_min_real_csv: Real min-temperature data.
        input_max_real_csv: Real max-temperature data.
        input_min_synth_csv: Synthetic min-temperature data.
        input_max_synth_csv: Synthetic max-temperature data.
        input_min_metadata: Metadata for the min-temperature data.
        input_max_metadata: Metadata for the max-temperature data.
        output_html: HTML file that both evaluators save their plots to.

    Returns:
        A `VisualizationOutput` named tuple whose `mlpipeline_ui_metadata` field
        is a JSON string embedding the HTML file content as an inline web-app.
    """
    import json
    from collections import namedtuple

    from synthguard.privacy_report_generator import PrivacyRiskEvaluator
    from synthguard.helper_functions import load_data_csv, load_metadata

    # Start the HTML document.
    # NOTE(review): no closing </body></html> is written in this function;
    # confirm how save_plot_to_html treats the existing file.
    with open(output_html, "w") as report:
        report.write("<html><body>")

    real_min = load_data_csv(input_min_real_csv)
    real_max = load_data_csv(input_max_real_csv)

    synth_min = load_data_csv(input_min_synth_csv)
    synth_max = load_data_csv(input_max_synth_csv)

    metadata_min = load_metadata(input_min_metadata)
    metadata_max = load_metadata(input_max_metadata)

    # Min dataset first, then max; both save into the same HTML file.
    for real, synthetic, metadata in ((real_min, synth_min, metadata_min),
                                      (real_max, synth_max, metadata_max)):
        evaluator = PrivacyRiskEvaluator(real_data=real, synthetic_data=synthetic, metadata=metadata, method='realistic')
        evaluator.run_privacy_realistic()
        evaluator.plot_privacy_metrics_realistic()
        evaluator.save_plot_to_html(output_html)

    # Read the finished report back so it can be embedded in the UI metadata.
    with open(output_html, 'r') as report:
        ui_metadata = {
            'outputs': [{
                'type': 'web-app',
                'storage': 'inline',
                'source': report.read(),
            }]
        }

    result_type = namedtuple('VisualizationOutput', ['mlpipeline_ui_metadata'])
    return result_type(json.dumps(ui_metadata))

privacy_comp = comp.create_component_from_func(sir_privacy, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


# TEADAL half pipeline

In [17]:
# The decorator must be applied with `@`; a bare `dsl.pipeline(...)` call builds
# the decorator and throws it away, so the pipeline name is never attached.
@dsl.pipeline(name='TEADAL_half_pipeline')
def pipeline(n_rows:int):
    """Wire the SIR components: input -> preprocess -> generation -> reports.

    Args:
        n_rows: Number of synthetic rows, forwarded to the generation step.
    """
    #PVC init
    existing_pvc = dsl.PipelineVolume(pvc='my-pvc')

    # The input step reads the source files from the PVC mounted at /mnt/pvc.
    input_sir = input_component('/mnt/pvc/').add_pvolumes({"/mnt/pvc": existing_pvc})

    preprocess_sir = preprocess_comp(input_sir.outputs['output_min_csv'], input_sir.outputs['output_max_csv'])

    generation_sir = generation_comp(n_rows,
                                 input_sir.outputs['output_min_csv'], 
                                 input_sir.outputs['output_max_csv'],
                                 preprocess_sir.outputs['output_min_metadata'],
                                 preprocess_sir.outputs['output_max_metadata'])

    # The three report steps take identical inputs: real data, synthetic data
    # and metadata. Calling a component registers its task, so the returned
    # task objects do not need to be kept.
    for report_comp in (diagnostic_component, quality_comp, privacy_comp):
        report_comp(input_sir.outputs['output_min_csv'],
                    input_sir.outputs['output_max_csv'],
                    generation_sir.outputs['output_min_csv'],
                    generation_sir.outputs['output_max_csv'],
                    preprocess_sir.outputs['output_min_metadata'],
                    preprocess_sir.outputs['output_max_metadata'])

Compiler().compile(pipeline, 'teadal_half_pipeline.yaml')

# text files street_names and municipalities

In [18]:
# from faker import Faker
# import pandas as pd
# from generate_personal_data import PersonalFaker
# from helper_functions import read_txt_and_convert_to_df



# # Create an instance of PersonalFaker
# estonian_fake = PersonalFaker("it_IT")

# # input_path = "./../../data-synthesis/docs/examples/energy-pilot-teadal/datasets/"

# input_file1 = "street_names.txt"
# input_file2 = "municipality_codes.txt"




# # Read and truncate files to the same length
# street_names, municipality_codes = (read_txt_and_convert_to_df(input_path, f) for f in [input_file1, input_file2])
# min_rows = min(len(street_names), len(municipality_codes))

# # Create the DataFrame with 'id' and 'address'
# real_data_addresses = pd.DataFrame({
#     'id': range(1, min_rows + 1),
#     'address': street_names.iloc[:min_rows].squeeze() + ", " + municipality_codes.iloc[:min_rows].squeeze().astype(str)
# })

# real_data_addresses



In [19]:
def load_streets_and_municipalities(input_path:str)->dict:
    """Read the street-name and municipality-code text files into plain lists.

    Args:
        input_path: Directory containing 'street_names.txt' and
            'municipality_codes.txt'.

    Returns:
        A dict with the keys 'streets' and 'municipalities', each holding the
        `.values.tolist()` of the corresponding loaded table.
    """
    from synthguard.helper_functions import read_txt_and_convert_to_df

    # The helper takes the directory and the file name as separate arguments
    # (see the commented reference cell in this notebook), so the file names
    # must not be prefixed with input_path a second time.
    input_file1 = "street_names.txt"
    input_file2 = "municipality_codes.txt"

    street_names, municipality_codes = (read_txt_and_convert_to_df(input_path, f) for f in [input_file1, input_file2])

    print(street_names.head())
    print(municipality_codes.head())
    
    streets_and_municipalities = {
        'streets': street_names.values.tolist(),
        'municipalities': municipality_codes.values.tolist()
    }
    return streets_and_municipalities

load_streets_and_municipalities_comp = comp.create_component_from_func(load_streets_and_municipalities, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


In [20]:
# from data_preprocessor import DataPreprocessor 

# dataPreprocessor = DataPreprocessor(data = real_data_addresses)
# processed_data_addresses, metadata_addresses = dataPreprocessor.preprocess_data()

In [21]:
# from synthguard.generate_personal_data import PersonalFaker
# italian_fake = PersonalFaker("it_IT")

# # Generate 10 addresses
# synthetic_addresses = italian_fake.generate_data_addresses(street_names, municipality_codes, n_addresses=real_data_addresses.shape[0])

# # Print the resulting DataFrame
# print(synthetic_addresses.head())

In [22]:
def generate_addresses(n_rows:int, input_dict: dict, output_csv: comp.OutputPath('csv')):
    """Generate synthetic Italian addresses and save them as CSV.

    Args:
        n_rows: Number of addresses to generate.
        input_dict: Dict with the keys 'streets' and 'municipalities'.
        output_csv: Path that receives the generated addresses.
    """
    from synthguard.generate_personal_data import PersonalFaker
    from synthguard.helper_functions import save_to_csv
    import pandas as pd 

    faker = PersonalFaker("it_IT")

    street_entries = input_dict['streets']
    # NOTE(review): only the municipality entries are unwrapped (first element
    # of each entry); the street entries are passed on as they are - confirm
    # this asymmetry is intended.
    municipality_entries = [entry[0] for entry in input_dict['municipalities']]

    addresses = faker.generate_data_addresses(
        pd.Series(street_entries),
        pd.Series(municipality_entries),
        n_addresses=n_rows,
    )

    print(addresses.head())

    save_to_csv(addresses, output_csv)

address_generation_component = comp.create_component_from_func(generate_addresses, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


In [23]:
# %load_ext autoreload
# %autoreload 2

# from diagnostic_report_generator import DiagnosticEvaluator

# OutputCSV = 'addresses_synthetic.csv'
# # output_path = 'synthetic_datasets/teadal/'


# if OutputCSV:
#     # Create the output path if it does not exist
#     import os
#     if not os.path.exists(output_path):
#         os.makedirs(output_path)
#     OutputCSV = output_path + OutputCSV
    


# diagnosticReportGenerator = DiagnosticEvaluator(real_data = processed_data_addresses, synthetic_data = synthetic_addresses, metadata = metadata_addresses)
# diagnosticReportGenerator.run_diagnostic_realistic()
# diagnosticReportGenerator.plot_diagnostic_report_realistic(output_path = output_path)

In [24]:
# from quality_report_generator import DataQualityEvaluator

# dataQualityEvaluator = DataQualityEvaluator(real_data = processed_data_addresses, synthetic_data = synthetic_addresses, metadata = metadata_addresses, method=synthetic_data_type)
# dataQualityEvaluator.evaluate_quality()
# dataQualityEvaluator.plot_quality_report_realistic(output_path = output_path)

In [25]:
# from privacy_report_generator import PrivacyRiskEvaluator

# privacyRiskEvaluator = PrivacyRiskEvaluator(real_data = processed_data_addresses, synthetic_data = synthetic_addresses, metadata = metadata_addresses, method=synthetic_data_type)
# privacyRiskEvaluator.run_privacy_realistic()
# privacyRiskEvaluator.plot_privacy_metrics_realistic(output_path = output_path)


# RT-CIT

RT-CIT
RT CIT thermal group:

- dimension [cadastre_code] - 10-character regional code for plant identification
- dimension [thermal_unit] - 4-character thermal group code, e.g. GT01


- dimension [plant_address] - street, number, building, staircase, ...
- dimension [municipality] - 6-character ISTAT municipality code, e.g. 50001
- measure [combustion_efficiency] - double-precision value between 0 and 1
Volume:
There are currently 1.851.142 registered plants, 5.595.063 energy efficiency check reports (RCEE) and 3.636 accredited maintenance technicians able to access and update the archive

In [26]:
# import string
# from generate_personal_data import PersonalFaker

# # output_path = 'synthetic_datasets/teadal/'

# # Create an instance of PersonalFaker
# estonian_fake = PersonalFaker("et_EE")

# # Lambda functions to generate cadastre and thermal unit codes
# generate_cadastre_code = lambda: estonian_fake.generate_code(10)
# generate_thermal_unit_code = lambda: "GT" + estonian_fake.generate_code(2, string.digits)

# # Parameters
# n_addresses = synthetic_addresses.shape[0]
# n_reports_per_address = 3


# cadastre_codes_thermal_units = estonian_fake.generate_cadastre_and_thermal_units(n_addresses, n_reports_per_address)

# # generate the rt-cit-thermal-group data
# rt_cit_thermal_group = estonian_fake.generate_rt_cit_thermal_group(synthetic_addresses, cadastre_codes_thermal_units, n_reports_per_address)
# rt_cit_thermal_group.to_csv(output_path + "rt-cit-thermal-group.csv", index=False)


# files_to_be_zipped.append(output_path + "rt-cit-thermal-group.csv")



In [27]:
def rtcit(input_csv: comp.InputPath('csv'), output_csv: comp.OutputPath('csv'), n_reports_per_address:int=3):
    """Generate the RT-CIT thermal-group dataset from the synthetic addresses.

    Args:
        input_csv: CSV with the synthetic addresses.
        output_csv: Path that receives the thermal-group data (no index column).
        n_reports_per_address: Forwarded to both PersonalFaker generation calls.
    """
    from synthguard.generate_personal_data import PersonalFaker
    from synthguard.helper_functions import load_data_csv

    faker = PersonalFaker("it_IT")
    addresses = load_data_csv(input_csv)

    # One set of cadastre codes / thermal units is requested per address row.
    codes_and_units = faker.generate_cadastre_and_thermal_units(addresses.shape[0], n_reports_per_address)

    thermal_group = faker.generate_rt_cit_thermal_group(addresses, codes_and_units, n_reports_per_address)
    thermal_group.to_csv(output_csv, index=False)

rtcit_component = comp.create_component_from_func(rtcit, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


#  RT-APE

* RT-APE dataset:
    * dimension [address]
    * dimension [municipality]
    * measure [energy_rating]: A4, A3, A2, A1, B, C, D, E, F, G

* Volume: 
    * APE currently contains 420.102 energy performance certificates, 443.142 registered units and 11.214 accredited certifiers

In [28]:

# from generate_personal_data import PersonalFaker
# import os

# # packages_to_install = ["xmlschema"]

# # Create an instance of PersonalFaker
# RT_APE_fake = PersonalFaker()

# # Generate synthetic addresses and municipality codes
# addresses = rt_cit_thermal_group['address'].tolist()
# municipality_codes = rt_cit_thermal_group['municipality_code'].tolist()

# # Define energy ratings and number of certificates
# energy_ratings = ["A4", "A3", "A2", "A1", "B", "C", "D", "E", "F", "G"]
# num_certificates = 200

# # Define file paths for the schema and example XML template
# xsd_file_path = os.path.join(input_path, 'rt-ape-schema.xsd')

# # Define the path to the example XML file
# xml_file_path = os.path.join(input_path, 'rt-ape-example.xml')



# # Generate XML files based on the template and schema and return the zipped files path
# RT_APE_files = RT_APE_fake.generate_xml_files_from_template(input_path, output_path, xsd_file_path, xml_file_path,
#                                          addresses, municipality_codes, num_certificates, energy_ratings,)

# # Add the zip file path to the list
# files_to_be_zipped.append(RT_APE_files)


In [29]:

def rtape(input_path:str, input_csv: comp.InputPath('csv'), output_zip: comp.OutputPath('zip'), num_certificates:int = 200):
    """Generate RT-APE XML certificates from the RT-CIT thermal-group data.

    Args:
        input_path: Directory containing 'rt-ape-schema.xsd' and
            'rt-ape-example.xml'.
        input_csv: CSV with 'address' and 'municipality_code' columns.
        output_zip: Output location passed to the XML generator.
        num_certificates: Number of certificates requested from the generator.
    """
    from synthguard.generate_personal_data import PersonalFaker
    import os
    from synthguard.helper_functions import load_data_csv

    faker = PersonalFaker()
    thermal_group = load_data_csv(input_csv)

    # Addresses and municipality codes come from the thermal-group table.
    address_list = thermal_group['address'].tolist()
    municipality_list = thermal_group['municipality_code'].tolist()

    ratings = ["A4", "A3", "A2", "A1", "B", "C", "D", "E", "F", "G"]

    # Schema and example XML template live next to the other input files.
    schema_path, template_path = (
        os.path.join(input_path, file_name)
        for file_name in ('rt-ape-schema.xsd', 'rt-ape-example.xml')
    )

    # The generator's return value is not used by this step.
    faker.generate_xml_files_from_template(
        input_path,
        output_zip,
        schema_path,
        template_path,
        address_list,
        municipality_list,
        num_certificates,
        ratings,
    )

rtape_component = comp.create_component_from_func(rtape, base_image=BASE_IMAGE)



Error when stripping type annotations: No module named 'lib2to3'


# ARPAT

In [30]:

def load_arpat(input_path:str, output_pm10_csv: comp.OutputPath('csv'), output_pm25_csv: comp.OutputPath('csv')):
    """Load 'arpat.json' and split it into a PM10 table and a PM2.5 table.

    Args:
        input_path: Directory containing 'arpat.json'.
        output_pm10_csv: Receives the data without the 'PM2dot5' column.
        output_pm25_csv: Receives the data without the 'PM10' column.
    """
    from synthguard.helper_functions import load_json, handle_nested_data_json
    import pandas as pd

    arpat_json = load_json(f"{input_path}/arpat.json")
    arpat_frame = handle_nested_data_json(pd.json_normalize(arpat_json))

    # Each output keeps one measure by dropping the other one.
    pm10_frame = arpat_frame.drop(columns=['PM2dot5'])
    pm25_frame = arpat_frame.drop(columns=['PM10'])

    # NOTE(review): to_csv is called without index=False here, so the index is
    # written too - confirm the downstream loader expects that.
    for frame, target in ((pm10_frame, output_pm10_csv), (pm25_frame, output_pm25_csv)):
        frame.to_csv(target)

load_arpat_component = comp.create_component_from_func(load_arpat, base_image=BASE_IMAGE)


Error when stripping type annotations: No module named 'lib2to3'


In [31]:
def arpat_preprocessor(input_pm10_csv:comp.InputPath('csv'), 
                       input_pm25_csv:comp.InputPath('csv'),
                       output_pm10_csv: comp.OutputPath('csv'),
                       output_pm25_csv: comp.OutputPath('csv'),
                       output_pm10_metadata: comp.OutputPath('json'),
                       output_pm25_metadata: comp.OutputPath('json')):
    """Preprocess the ARPAT PM10 and PM2.5 tables and store data plus metadata.

    Args:
        input_pm10_csv: CSV with the PM10 table.
        input_pm25_csv: CSV with the PM2.5 table.
        output_pm10_csv: Receives the processed PM10 data.
        output_pm25_csv: Receives the processed PM2.5 data.
        output_pm10_metadata: Receives the PM10 metadata.
        output_pm25_metadata: Receives the PM2.5 metadata.
    """
    from synthguard.data_preprocessor import DataPreprocessor
    from synthguard.helper_functions import load_data_csv, save_metadata, save_to_csv

    pm10_frame = load_data_csv(input_pm10_csv)
    pm25_frame = load_data_csv(input_pm25_csv)

    # NOTE(review): both measure columns are listed as columns_to_drop although
    # each table's own measure also appears in its columns_dict - confirm what
    # DataPreprocessor does with this argument.
    columns_to_drop = ['PM10', 'PM2dot5']

    jobs = (
        (pm10_frame, 'PM10', output_pm10_metadata, output_pm10_csv),
        (pm25_frame, 'PM2dot5', output_pm25_metadata, output_pm25_csv),
    )
    for frame, measure, metadata_path, csv_path in jobs:
        # Target dtypes: observation date, the table's measure, municipality.
        columns_dict = {
            "DATA_OSSERVAZIONE": "datetime64[ns]",
            measure: 'float64',
            'COMUNE': 'string',
        }
        processed, metadata = DataPreprocessor(data=frame).preprocess_data(
            columns_dict=columns_dict, columns_to_drop=columns_to_drop)

        save_metadata(metadata, metadata_path)
        save_to_csv(processed, csv_path)

arpat_preprocessor_component = comp.create_component_from_func(arpat_preprocessor, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


In [32]:
def arpat25_generation(n_rows:int, input_metadata:comp.InputPath('json'), input_preprocess_csv:comp.InputPath('csv'), output_csv:comp.OutputPath('csv')):
    """Assemble the synthetic ARPAT PM2.5 table and write it to ``output_csv``.

    The ``municipality`` and ``observation_date`` columns come from an
    Italian-locale ``PersonalFaker``; the ``PM2dot5`` column is copied from
    the output of ``SyntheticDataGenerator`` run on the preprocessed data.

    Args:
        n_rows: Number of rows requested from the faker and the generator.
        input_metadata: Path of the metadata produced by preprocessing.
        input_preprocess_csv: Path of the preprocessed PM2.5 CSV.
        output_csv: Where the final synthetic CSV is written.
    """
    # Function-scope imports keep the component self-contained.
    import pandas as pd
    from synthguard.generate_personal_data import PersonalFaker
    from synthguard.synthetic_data_generator import SyntheticDataGenerator
    from synthguard.helper_functions import save_to_csv, load_metadata, load_data_csv

    # Faked columns: administrative units first, then dates within 2023.
    faker_it = PersonalFaker("it_IT")
    municipalities = faker_it.generate_administrative_units(n_rows)
    observation_dates = faker_it.generate_random_dates(n_rows, "2023-01-01", "2023-12-31")

    synthetic_table = pd.DataFrame({
        'municipality': municipalities,
        'observation_date': observation_dates
    })

    processed_pm25 = load_data_csv(input_preprocess_csv)
    metadata_pm25 = load_metadata(input_metadata)

    # NOTE(review): this file name is handed to the generator and is separate
    # from the component's own ``output_csv`` — presumably a side file; confirm.
    generator = SyntheticDataGenerator(
        n_rows=n_rows,
        output_csv='arpat-synthetic_PM25.csv',
        method='realistic',
    )
    generated_pm25 = generator.generate_synthetic_data(processed_pm25, metadata_pm25)

    # Only the measurement column is taken from the generated data.
    synthetic_table['PM2dot5'] = generated_pm25['PM2dot5']
    print(synthetic_table.head())

    save_to_csv(synthetic_table, output_csv)

arpat25_generation_component = comp.create_component_from_func(arpat25_generation, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


In [33]:
def arpat10_generation(n_rows:int, input_metadata:comp.InputPath('json'), input_preprocess_csv:comp.InputPath('csv'), output_csv:comp.OutputPath('csv')):
    """Assemble the synthetic ARPAT PM10 table and write it to ``output_csv``.

    The ``municipality`` and ``observation_date`` columns come from an
    Italian-locale ``PersonalFaker``; the ``PM10`` column is copied from the
    output of ``SyntheticDataGenerator`` run on the preprocessed data.

    Args:
        n_rows: Number of rows requested from the faker and the generator.
        input_metadata: Path of the metadata produced by preprocessing.
        input_preprocess_csv: Path of the preprocessed PM10 CSV.
        output_csv: Where the final synthetic CSV is written.
    """
    # Function-scope imports keep the component self-contained.
    import pandas as pd
    from synthguard.generate_personal_data import PersonalFaker
    from synthguard.helper_functions import save_to_csv, load_metadata, load_data_csv
    from synthguard.synthetic_data_generator import SyntheticDataGenerator

    # Faked columns: administrative units first, then dates within 2023.
    faker_it = PersonalFaker("it_IT")
    municipalities = faker_it.generate_administrative_units(n_rows)
    observation_dates = faker_it.generate_random_dates(n_rows, "2023-01-01", "2023-12-31")

    synthetic_table = pd.DataFrame({
        'municipality': municipalities,
        'observation_date': observation_dates
    })

    processed_pm10 = load_data_csv(input_preprocess_csv)
    metadata_pm10 = load_metadata(input_metadata)

    # NOTE(review): this file name is handed to the generator and is separate
    # from the component's own ``output_csv`` — presumably a side file; confirm.
    generator = SyntheticDataGenerator(
        n_rows=n_rows,
        output_csv='arpat-synthetic_PM10.csv',
        method='realistic',
    )
    generated_pm10 = generator.generate_synthetic_data(processed_pm10, metadata_pm10)

    # Only the measurement column is taken from the generated data.
    synthetic_table['PM10'] = generated_pm10['PM10']

    save_to_csv(synthetic_table, output_csv)

arpat10_generation_component = comp.create_component_from_func(arpat10_generation, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


# BOX2M

In [34]:
# %load_ext autoreload
# %autoreload 2

# from generate_personal_data import PersonalFaker

# # Usage Example
# BOX2M_UNITS = [
#     {"name": "Current L1", "channel": 8, "unit": "A"},
#     {"name": "Current L2", "channel": 10, "unit": "A"},
#     {"name": "Current L3", "channel": 12, "unit": "A"},
#     {"name": "Total Active Power", "channel": 58, "unit": "KW"},
#     {"name": "Total Active Energy Import", "channel": 6688, "unit": "KWh"},
# ]

# box2m_faker = PersonalFaker(locale="it_IT")
# output_box2m_file_path = box2m_faker.generate_box2m_data(
#     n_addresses=5, 
#     output_dir=output_path, 
#     n_records_per_day=3, 
#     n_box2m_records=500,
#     box2m_units=BOX2M_UNITS
# )

# files_to_be_zipped.append(output_box2m_file_path)

In [35]:
def box2m(output_json:comp.OutputPath('json'), n_records:int = 500):
    """Generate fake BOX2M meter data with an Italian-locale ``PersonalFaker``.

    Args:
        output_json: Component output path; forwarded to the faker as its
            ``output_dir``.
        n_records: Number of BOX2M records to generate (default 500).
    """
    # Function-scope import keeps the component self-contained.
    from synthguard.generate_personal_data import PersonalFaker

    # (name, channel, unit) for every measured quantity.
    unit_rows = (
        ("Current L1", 8, "A"),
        ("Current L2", 10, "A"),
        ("Current L3", 12, "A"),
        ("Total Active Power", 58, "KW"),
        ("Total Active Energy Import", 6688, "KWh"),
    )
    measured_units = [
        {"name": unit_name, "channel": channel, "unit": unit}
        for unit_name, channel, unit in unit_rows
    ]

    faker_it = PersonalFaker(locale="it_IT")
    # NOTE(review): ``output_json`` is passed as a *directory* and the path
    # returned by the faker is not used — verify that the artifact produced at
    # ``output_json`` is what the downstream zip step expects.
    faker_it.generate_box2m_data(
        n_addresses=5,
        output_dir=output_json,
        n_records_per_day=3,
        n_box2m_records=n_records,
        box2m_units=measured_units
    )

box2m_component = comp.create_component_from_func(box2m, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


# ZIP

In [36]:
def zip_files(rt_cit: comp.InputPath('csv'), 
              rt_ape: comp.InputPath('zip'), 
              sir_temp: comp.InputPath('json'), 
              arpat25: comp.InputPath('csv'), 
              arpat10: comp.InputPath('csv'),
              box2m: comp.InputPath('json'),
              output: comp.OutputPath('zip')):
    """Bundle the six pipeline artifacts into a single archive at ``output``.

    Args:
        rt_cit: Path of the RT-CIT CSV artifact.
        rt_ape: Path of the RT-APE zip artifact.
        sir_temp: Path of the SIR temperature JSON artifact.
        arpat25: Path of the synthetic ARPAT PM2.5 CSV.
        arpat10: Path of the synthetic ARPAT PM10 CSV.
        box2m: Path of the BOX2M artifact.
        output: Where the resulting archive is written.
    """
    # Aliased on import so the library helper is not mistaken for this
    # component function, which carries the same name.
    from synthguard.helper_functions import zip_files as archive_files

    # Paths are passed in the same order as the parameters above.
    archive_files([rt_cit, rt_ape, sir_temp, arpat25, arpat10, box2m], output)

zip_files_component = comp.create_component_from_func(zip_files, base_image=BASE_IMAGE)

Error when stripping type annotations: No module named 'lib2to3'


# Pipeline

In [37]:
@dsl.pipeline(name='test_TEADAL_pipeline')
def pipeline(n_rows:int):
    """Wire the SIR, RT-CIT/RT-APE, ARPAT and BOX2M steps and zip their outputs.

    Args:
        n_rows: Forwarded to the SIR generation step, the address generation
            step and both ARPAT generation steps. It has no default, so every
            run must supply it.
    """
    # Existing PVC 'my-pvc'; mounted at /mnt/data/ on every step that reads
    # the source datasets from disk.
    existing_pvc = dsl.PipelineVolume(pvc='my-pvc')

    # --- SIR temperature branch ---
    input_sir = input_component('/mnt/data/datasets/').add_pvolumes({"/mnt/data/": existing_pvc})

    preprocess_sir = preprocess_comp(input_sir.outputs['output_min_csv'], input_sir.outputs['output_max_csv'])

    generation_sir = generation_comp(n_rows,
                                 input_sir.outputs['output_min_csv'], 
                                 input_sir.outputs['output_max_csv'],
                                 preprocess_sir.outputs['output_min_metadata'],
                                 preprocess_sir.outputs['output_max_metadata'])
    
    # Combined SIR JSON is the only SIR artifact that reaches the final zip.
    sir_combine = combine_comp(generation_sir.outputs['output_min_csv'], generation_sir.outputs['output_max_csv'], input_sir.outputs['output_json'])
    
    # The three evaluation steps below take the same six inputs (real data,
    # synthetic data, metadata); their results are not consumed by any later
    # step in this pipeline.
    diagnostic_sir = diagnostic_component(input_sir.outputs['output_min_csv'], 
                                 input_sir.outputs['output_max_csv'],
                                 generation_sir.outputs['output_min_csv'],
                                 generation_sir.outputs['output_max_csv'],
                                 preprocess_sir.outputs['output_min_metadata'],
                                 preprocess_sir.outputs['output_max_metadata'])
    
    utility_sir = quality_comp(input_sir.outputs['output_min_csv'], 
                                 input_sir.outputs['output_max_csv'],
                                 generation_sir.outputs['output_min_csv'],
                                 generation_sir.outputs['output_max_csv'],
                                 preprocess_sir.outputs['output_min_metadata'],
                                 preprocess_sir.outputs['output_max_metadata'])
    
    privacy_sir = privacy_comp(input_sir.outputs['output_min_csv'], 
                                 input_sir.outputs['output_max_csv'],
                                 generation_sir.outputs['output_min_csv'],
                                 generation_sir.outputs['output_max_csv'],
                                 preprocess_sir.outputs['output_min_metadata'],
                                 preprocess_sir.outputs['output_max_metadata'])
    
    # --- Address / RT-CIT / RT-APE branch ---
    load_streets_and_municipalities = load_streets_and_municipalities_comp('/mnt/data/datasets/').add_pvolumes({"/mnt/data/": existing_pvc})

    address_generation = address_generation_component(n_rows, load_streets_and_municipalities.output)

    rtcit = rtcit_component(address_generation.output)

    rtape = rtape_component('/mnt/data/datasets/', rtcit.output).add_pvolumes({"/mnt/data/": existing_pvc})
    
    # --- ARPAT branch ---
    # NOTE(review): unlike the other dataset paths, this one has no trailing
    # slash — confirm the load_arpat component joins paths accordingly.
    load_arpat = load_arpat_component('/mnt/data/datasets').add_pvolumes({"/mnt/data/": existing_pvc})

    arpat_preprocessor = arpat_preprocessor_component(load_arpat.outputs['output_pm10_csv'], load_arpat.outputs['output_pm25_csv'])

    arpat25_generation = arpat25_generation_component(n_rows, arpat_preprocessor.outputs['output_pm25_metadata'], arpat_preprocessor.outputs['output_pm25_csv'])

    arpat10_generation = arpat10_generation_component(n_rows, arpat_preprocessor.outputs['output_pm10_metadata'], arpat_preprocessor.outputs['output_pm10_csv'])

    # --- BOX2M branch ---
    # Called without arguments, so the component's default n_records applies.
    box2m = box2m_component()

    # --- Final archive: one artifact from each branch, in this order ---
    zip_files = zip_files_component(rtcit.output,
                          rtape.output,
                          sir_combine.outputs['output_combined_json'],
                          arpat25_generation.output,
                          arpat10_generation.output,
                          box2m.output)
        
# Compile the pipeline (without any PipelineConf) to a workflow YAML file.
Compiler().compile(pipeline, 'TEADAL.yaml')

In [38]:
from kfp import Client
from kubernetes import client as k8s_client

In [39]:
# Rows to synthesise per dataset. `pipeline` declares `n_rows` without a
# default, so a run submitted with empty arguments has no value for it.
# 500 is an example value (it mirrors the box2m default) — adjust as needed.
N_ROWS = 500

pipeline_conf = dsl.PipelineConf()
# Image pull secrets are documented as a list of V1LocalObjectReference
# objects; "regcred" is the name of the registry secret in the cluster.
pipeline_conf.set_image_pull_secrets([k8s_client.V1LocalObjectReference(name="regcred")])

# Compile the pipeline with the configuration
pipeline_path = 'TEADAL_IMAGE.yaml'
Compiler().compile(pipeline, pipeline_path, pipeline_conf=pipeline_conf)

# Client() with no arguments relies on the ambient configuration (in-cluster
# or local kube config) to locate the Kubeflow Pipelines API.
client = Client()
run_result = client.create_run_from_pipeline_func(
    pipeline,
    arguments={'n_rows': N_ROWS},
    pipeline_conf=pipeline_conf,
)

Failed to load kube config.


MaxRetryError: HTTPConnectionPool(host='localhost', port=80): Max retries exceeded with url: /apis/v1beta1/healthz (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x72d25a978ef0>: Failed to establish a new connection: [Errno 111] Connection refused'))