# This is KubeFlow pipeline Auto Generator

Below is the implementation of a pipeline auto-generator that builds Kubeflow pipelines from a config.yaml file.

In [140]:
#from numpy import number
#!pip install kfp pandas pyyaml
#!pip install --upgrade kfp
#!pip install kfp==1.8.22 pandas pyyaml

In [141]:
import gzip
from dbm import error

import kfp
import pandas as pd
from kfp import dsl
from kfp import compiler
import hashlib
from typing import List, Dict, Any
from kfp.dsl import Input, Output, Dataset
import yaml


In [142]:
# stages' function definitions
# To support a new stage type, define its component here and add its mapping in "create_component_for_stage()".

# filtering 
@dsl.component(
    base_image="python:3.12.2",
    packages_to_install=['pandas','requests']
)
def filtering(data_path:str, operation: str, column_name: str, threshold: int, output_json: dsl.Output[Dataset]):
    """Fetch JSON records over HTTP, keep the rows that pass a threshold comparison, and save them as JSON.

    Args:
        data_path: URL fetched with an HTTP GET; the JSON body is loaded into a pandas DataFrame.
        operation: Comparison to apply: 'greater_than', 'less_than' or 'equal_to'.
        column_name: Column whose values are compared against ``threshold``.
        threshold: Value the column is compared with.
        output_json: Output artifact; the surviving rows are written to its path as a JSON list of records.

    Raises:
        ValueError: If the response status is not 200, the column is missing,
            or the operation is not supported.
    """
    import json

    import pandas as pd
    import requests

    def compare_rows(data: pd.DataFrame, column_name: str, threshold: int, operation: str) -> pd.DataFrame:
        """Return the rows of ``data`` whose ``column_name`` value satisfies ``operation`` against ``threshold``."""
        if operation == 'greater_than':
            return data[data[column_name] > threshold]
        elif operation == 'less_than':
            return data[data[column_name] < threshold]
        elif operation == 'equal_to':
            return data[data[column_name] == threshold]
        else:
            raise ValueError("Unsupported operation")

    try:
        # A timeout keeps the step from hanging forever on an unresponsive endpoint.
        response = requests.get(data_path, timeout=60)
        if response.status_code != 200:
            raise ValueError(f"problem in rendering the URL response from path '{data_path}'")
        data = pd.DataFrame(response.json())

        if column_name not in data.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

        filtered_data = compare_rows(data, column_name, threshold, operation)
        if filtered_data.empty:
            print("filtered dat was Empty !!!")

        with open(output_json.path, 'w') as f:
            json.dump(filtered_data.to_dict(orient='records'), f)

        # Report where the data actually went (the artifact path, not the source URL).
        print(f"Filtered data has been written to {output_json.path}")
    except Exception as e:
        # Log for the step's console output, then re-raise so the step fails
        # instead of reporting success without having written its output.
        print(f"An error occurred: {e}")
        raise
        
        
@dsl.component(
    base_image="python:3.12.2",
    packages_to_install=['pandas']
)
def anonymize_columns(columns_to_anonymize: list, input_json: dsl.Input[Dataset],output_json: dsl.Output[Dataset]):
    """Replace the values of the selected columns with their SHA-256 hex digest.

    Args:
        columns_to_anonymize: Names of the columns to hash. Names that are not
            present in the data only produce a warning.
        input_json: Input artifact holding JSON that is either a list of
            records or a dict accepted by ``pandas.DataFrame.from_dict``.
        output_json: Output artifact; the resulting rows are written to its
            path as a JSON list of records.

    Raises:
        ValueError: If the input JSON is neither a list nor a dict.
    """
    import hashlib
    import json

    import pandas as pd

    with open(input_json.path, 'r') as f:
        data = json.load(f)

    # Accept either a list of records or a column-oriented dict.
    if isinstance(data, list):
        data_df = pd.DataFrame(data)
    elif isinstance(data, dict):
        data_df = pd.DataFrame.from_dict(data)
    else:
        raise ValueError("Data is not in a valid format for DataFrame.")

    for column in columns_to_anonymize:
        if column in data_df.columns:
            # Hash the string form of every value so any dtype can be anonymized.
            data_df[column] = data_df[column].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())
        else:
            print(f"Warning: Column '{column}' not found in the data.")

    # Write once, after all columns are processed. This also guarantees the
    # artifact exists when the column list is empty.
    with open(output_json.path, 'w') as f:
        json.dump(data_df.to_dict(orient='records'), f)
        
# Aggregation
@dsl.component(
    base_image="python:3.12.2",
    # 'typing' is part of the standard library (the PyPI package of that name
    # is a backport for Python < 3.5) and this function does not import it.
    packages_to_install=['pandas']
)
def aggregate_columns(
    groupby_column: str, 
    columns_to_aggregate: list,  # list[str]
    aggregation_functions: dict,  # dict[str, list[str]], 
    input_json: dsl.Input[Dataset],
    output_json: dsl.Output[Dataset]
):
    """Group the input rows by one column, aggregate, and save the selected result columns as JSON.

    Args:
        groupby_column: Column passed to ``DataFrame.groupby``.
        columns_to_aggregate: Names of the result columns to keep. They are
            matched against the column names after aggregation (multi-level
            names are flattened to ``<column>_<function>``); names that are
            not present are dropped silently.
        aggregation_functions: Mapping handed to ``.agg()``.
        input_json: Input artifact holding JSON that is either a list of
            records or a dict accepted by ``pandas.DataFrame.from_dict``.
        output_json: Output artifact; the aggregated rows are written to its
            path as a JSON list of records.

    Raises:
        ValueError: If the input JSON is neither a list nor a dict.
        KeyError: If a column needed for the aggregation is missing.
        RuntimeError: If any other error occurs while aggregating or writing.
    """
    import json

    import pandas as pd

    with open(input_json.path, 'r') as f:
        data = json.load(f)

    # Accept either a list of records or a column-oriented dict.
    if isinstance(data, list):
        data_df = pd.DataFrame(data)
    elif isinstance(data, dict):
        data_df = pd.DataFrame.from_dict(data)
    else:
        raise ValueError("Data is not in a valid format for DataFrame.")

    try:
        aggregated_data = data_df.groupby(groupby_column).agg(aggregation_functions)

        # Flatten multi-index columns if they exist.
        if isinstance(aggregated_data.columns, pd.MultiIndex):
            aggregated_data.columns = ['_'.join(col).strip() for col in aggregated_data.columns.values]

        # Keep only the requested columns that are present in the result.
        available_columns = [col for col in columns_to_aggregate if col in aggregated_data.columns]
        aggregated_data = aggregated_data[available_columns]
        aggregated_data.reset_index(inplace=True)

        print(f"Aggregated data has been written to {aggregated_data.to_dict(orient='records')}")
        with open(output_json.path, 'w') as f:
            json.dump(aggregated_data.to_dict(orient='records'), f)
    except KeyError as e:
        # Raising a plain string is itself a TypeError; raise a real exception
        # and chain the original so the traceback keeps the root cause.
        raise KeyError(f"Error: Column {e} not found in the data.") from e
    except Exception as e:
        raise RuntimeError(f"An error occurred: {e}") from e


# Function to compress JSON to GZIP
@dsl.component(
    base_image="python:3.12.2",
    packages_to_install=['pandas'],
)
def compress_json_to_gzip(
    input_json: dsl.Input[Dataset],
    output_json: dsl.Output[Dataset]
):
    """Re-serialize the input JSON and store it gzip-compressed at the output path.

    Args:
        input_json: Input artifact whose file is parsed as JSON.
        output_json: Output artifact; receives the gzip-compressed JSON text.
            If serialization or compression fails, the error is printed and an
            empty file is written there instead.
    """
    import gzip
    import json

    with open(input_json.path, 'r') as source:
        payload = json.load(source)

    try:
        # Serialize before opening the archive so a serialization failure
        # never leaves a partially written gzip stream behind.
        serialized = json.dumps(payload)
        with gzip.open(output_json.path, 'wt', encoding='utf-8') as archive:
            archive.write(serialized)
    except Exception as exc:
        print(f"An error occurred: {exc}")
        # Best-effort fallback: leave an empty file at the output path.
        with open(output_json.path, 'w') as placeholder:
            placeholder.write('')
            
# Function to compress CSV to GZIP
@dsl.component(
    base_image="python:3.12.2",
    packages_to_install=['pandas'],
)
def compress_csv_to_gzip(
    input_json: dsl.Input[Dataset],
    output_json: dsl.Output[Dataset]
):
    """Load the input JSON into a DataFrame and store it as gzip-compressed CSV.

    Args:
        input_json: Input artifact holding JSON that is either a list of
            records or a dict accepted by ``pandas.DataFrame.from_dict``.
        output_json: Output artifact; receives the gzip-compressed CSV
            (written without the index column).

    Raises:
        ValueError: If the input JSON is neither a list nor a dict.
        Exception: If writing the compressed CSV fails.
    """
    import gzip
    import json

    import pandas as pd

    with open(input_json.path, 'r') as source:
        payload = json.load(source)

    # Reject anything that cannot be turned into a table up front.
    if not isinstance(payload, (list, dict)):
        raise ValueError("Data is not in a valid format for DataFrame.")
    frame = pd.DataFrame(payload) if isinstance(payload, list) else pd.DataFrame.from_dict(payload)

    try:
        with gzip.open(output_json.path, 'wt') as archive:
            frame.to_csv(archive, index=False)
    except Exception as exc:
        raise Exception(f"An error occurred during compression: {exc}")

In [143]:
# parse the yaml file to create the kubeflow pipeline:

# Deployment of the kubeflow
class Deployment:
    """Deployment settings: a namespace and a Prometheus URL."""

    def __init__(self, namespace: str, prometheusURL: str):
        self.namespace, self.prometheusURL = namespace, prometheusURL

    def to_dict(self):
        """Return the settings as a plain dictionary."""
        return dict(namespace=self.namespace, prometheusURL=self.prometheusURL)

#
class Stage:
    """One configured processing step: a name, a stage type and that type's parameters."""

    _FIELDS = ('name', 'type', 'parameter')

    def __init__(self, name: str, type: str, parameter: Dict[str, Any]):
        self.name = name
        self.type = type
        self.parameter = parameter

    def to_dict(self):
        """Return the stage as a plain dictionary keyed by field name."""
        return {field: getattr(self, field) for field in self._FIELDS}
    
# pipeline name must be unique in the whole config file

class Pipeline:
    """A named, ordered list of stage names plus the list of its consumers."""

    def __init__(self, name: str, flow: List[str], consumers: List[str]):
        self.name = name
        self.flow = flow
        self.consumers = consumers

    def to_dict(self):
        """Return the pipeline as a plain dictionary."""
        as_dict = {'name': self.name}
        as_dict['flow'] = self.flow
        as_dict['consumers'] = self.consumers
        return as_dict
    
class PipelineChain:
    """A named, ordered list of pipeline names."""

    def __init__(self, name: str, flow: List[str]):
        self.name, self.flow = name, flow

    def to_dict(self):
        """Return the chain as a plain dictionary."""
        return dict(name=self.name, flow=self.flow)
    
        
class PipelineConfig:
    """Full configuration: pipelines, stages, deployment settings and pipeline chains."""

    def __init__(
        self,
        pipelines: List[Pipeline],
        stages: List[Stage],
        deployment: Deployment,
        pipeline_chains: List[PipelineChain],
    ):
        self.pipelines = pipelines
        self.stages = stages
        self.deployment = deployment
        self.pipeline_chains = pipeline_chains

    def to_dict(self):
        """Return the configuration as nested plain dictionaries and lists."""
        as_dict = {'deployment': self.deployment.to_dict()}
        as_dict['stages'] = [item.to_dict() for item in self.stages]
        as_dict['pipelines'] = [item.to_dict() for item in self.pipelines]
        as_dict['pipelineChains'] = [item.to_dict() for item in self.pipeline_chains]
        return as_dict
         
        

In [144]:
def create_component_for_stage(stage: 'Stage'):
    """Return the component function registered for ``stage.type``.

    Prints an error and returns ``None`` when the stage type is not recognised.
    """
    kind = stage.type

    if kind == 'filtering':
        return filtering
    if kind == 'anonymization':
        return anonymize_columns
    if kind == 'aggregation':
        return aggregate_columns
    if kind == 'compress_json_to_gzip':
        return compress_json_to_gzip
    if kind == 'compress_csv_to_gzip':
        return compress_csv_to_gzip

    print(f"Error: Unknown stage type '{kind}'")
    return None

In [145]:
# creating the pipelines based on the pipelines config file
from kubernetes.client import V1Pod, V1ObjectMeta

def dynamic_pipeline(data_path: str, pipeline_config: PipelineConfig, pipeline: Pipeline):
    """Instantiate the component of every stage in ``pipeline.flow``, feeding each one the previous output.

    Args:
        data_path: Passed as ``data_path`` to every 'filtering' stage.
        pipeline_config: Configuration whose ``stages`` are searched by name.
        pipeline: Pipeline whose ``flow`` lists the stage names to run, in order.

    Returns:
        The value returned by the most recent component call.

    Raises:
        Exception: If a stage has a type this function does not handle.
    """
    # The most recent component result is kept in a module-level global, so a
    # stage that consumes 'output_json' reads whatever an earlier stage (or an
    # earlier call to this function) stored there.
    global output_data

    for stage_name in pipeline.flow:
        # First stage with a matching name wins.
        matching = [candidate for candidate in pipeline_config.stages if candidate.name == stage_name]
        if not matching:
            print(f"Error: Stage {stage_name} not found in the configuration.")
            continue
        stage = matching[0]

        component_op = create_component_for_stage(stage)
        task_name = f"{pipeline.name}-{stage.type}-{stage_name}".replace('_', '-')

        # Collect the keyword arguments each stage type expects.
        if stage.type == 'filtering':
            call_args = {
                'data_path': data_path,
                'operation': stage.parameter['operation'],
                'column_name': stage.parameter['column_name'],
                'threshold': stage.parameter['threshold'],
            }
        elif stage.type == 'anonymization':
            call_args = {
                'input_json': output_data.outputs['output_json'],
                'columns_to_anonymize': stage.parameter['columns_to_anonymize'],
            }
        elif stage.type == 'aggregation':
            call_args = {
                'input_json': output_data.outputs['output_json'],
                'groupby_column': stage.parameter['groupby_column'],
                'columns_to_aggregate': stage.parameter['columns_to_aggregate'],
                'aggregation_functions': stage.parameter['aggregation_functions'],
            }
        elif stage.type in ('compress_json_to_gzip', 'compress_csv_to_gzip'):
            call_args = {'input_json': output_data.outputs['output_json']}
        else:
            raise Exception(f"Component for stage '{stage_name}' could not be created.")

        output_data = component_op(**call_args)
        #output_data.set_display_name(task_name)

    return output_data




In [146]:
# Function to read YAML and convert to PipelineConfig
def read_yaml_to_pipeline_config(file_path: str) -> PipelineConfig:
    """Parse a YAML configuration file into a ``PipelineConfig``.

    Required top-level keys are ``stages`` (items with ``name``, ``type``,
    ``parameter``) and ``pipelines`` (items with ``name``, ``flow``,
    ``consumers``). The deployment section is optional and is read from
    ``Deployment`` or, failing that, ``deployment`` (the key that
    ``PipelineConfig.to_dict`` emits). ``pipelineChains`` is optional too.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The populated ``PipelineConfig``.

    Raises:
        ValueError: If the file does not contain a YAML mapping at top level.
        KeyError: If a required key is missing.
    """
    with open(file_path, 'r') as file:
        data = yaml.safe_load(file)

    # An empty file parses to None; fail with a clear message instead of a
    # TypeError on the first subscript.
    if not isinstance(data, dict):
        raise ValueError(f"Configuration file '{file_path}' does not contain a YAML mapping.")

    stages = [
        Stage(name=stage['name'], type=stage['type'], parameter=stage['parameter'])
        for stage in data['stages']
    ]

    pipelines = [
        Pipeline(name=pipeline['name'], flow=pipeline['flow'], consumers=pipeline['consumers'])
        for pipeline in data['pipelines']
    ]

    # Accept both spellings of the key, and treat a present-but-empty section
    # (parsed as None) the same as a missing one.
    deployment_data = data.get('Deployment') or data.get('deployment') or {}
    deployment = Deployment(namespace=deployment_data.get('namespace', ''),
                            prometheusURL=deployment_data.get('prometheusURL', ''))

    # `or []` also covers a `pipelineChains:` key with no entries.
    pipeline_chains = [
        PipelineChain(name=pipeline_chain['name'], flow=pipeline_chain['flow'])
        for pipeline_chain in data.get('pipelineChains') or []
    ]

    return PipelineConfig(pipelines=pipelines, stages=stages, deployment=deployment, pipeline_chains=pipeline_chains)

In [147]:
# Function to dynamically handle pipeline chains
def dynamic_pipeline_chain(data_path: str, pipeline_config: PipelineConfig, chain: PipelineChain):
    """Run ``dynamic_pipeline`` for every pipeline named in ``chain.flow``, in order.

    Names that do not match any pipeline in ``pipeline_config.pipelines`` are
    reported and skipped.

    Returns:
        The result of the last ``dynamic_pipeline`` call, or ``None`` if no
        pipeline was run.
    """
    last_result = None

    for wanted_name in chain.flow:
        # First pipeline with a matching name wins.
        candidates = [entry for entry in pipeline_config.pipelines if entry.name == wanted_name]
        if not candidates:
            print(f"Error: Pipeline {wanted_name} not found in the configuration.")
            continue

        last_result = dynamic_pipeline(data_path, pipeline_config, candidates[0])

    return last_result

In [148]:
#TODO: (first) pipeline datasource.type must be csv and it's path.

# Adjusted pipeline creation for chains
def create_pipeline_for_each_chain(
    client: kfp.Client,
    pipeline_config_file: str = './data/shipments_pipeline_config.yaml',
    data_url: str = 'http://industry.teadal.ubiwhere.com/fdp-czech-plant/shipments',
):
    """Compile one Kubeflow pipeline per configured chain and upload it.

    For every chain in the configuration, a pipeline named ``chain_<name>`` is
    compiled to ``chain_<name>.yaml`` in the working directory and uploaded
    through ``client``.

    Args:
        client: Client used to upload the compiled pipelines.
        pipeline_config_file: Path of the YAML configuration file (PARAM_1).
        data_url: URL of the data set handed to the chain's pipelines (PARAM_2).

    Returns:
        A confirmation message once every chain has been processed.
    """
    pipeline_config = read_yaml_to_pipeline_config(pipeline_config_file)

    for chain in pipeline_config.pipeline_chains:
        # Derive the pipeline name from the chain name.
        chain_name = f"chain_{chain.name}"

        # The pipeline function closes over `chain`; that is safe because it is
        # compiled within this same iteration, before `chain` is rebound.
        @dsl.pipeline(
            name=chain_name,
            description=f"Pipeline chain generated from config: {chain.name}"
        )
        def kubeflow_pipeline():
            dynamic_pipeline_chain(data_url, pipeline_config, chain)

        # Compile the pipeline chain
        chain_file_name = chain_name + '.yaml'
        kfp.compiler.Compiler().compile(pipeline_func=kubeflow_pipeline, package_path=chain_file_name)
        print(f"Pipeline chain '{chain_name}' compiled successfully.")

        # Upload the compiled chain
        client.upload_pipeline(pipeline_package_path=chain_file_name, pipeline_name=chain_name)
        print(f"Pipeline chain '{chain_name}' uploaded successfully.")

    return "All pipeline chains compiled successfully!"

In [149]:
import kfp
import kfp.dsl as dsl
def main():
    """Create a default ``kfp.Client``, build and upload every pipeline chain, and print the result message."""
    print(create_pipeline_for_each_chain(kfp.Client()))


# Run only when executed as the main module (which includes a notebook kernel).
if __name__ == "__main__":
    main()

Pipeline chain 'chain_shipments_anonymize_chain' compiled successfully.


Pipeline chain 'chain_shipments_anonymize_chain' uploaded successfully.
Pipeline chain 'chain_shipments_data_chain' compiled successfully.


Pipeline chain 'chain_shipments_data_chain' uploaded successfully.
All pipeline chains compiled successfully!
