# This is KubeFlow pipeline Auto Generator

Below is the implementation of a pipeline autogenerate based on a config.yaml file. 


In [44]:
from numpy import number
!pip install kfp pandas pyyaml
!pip install --upgrade kfp




In [45]:
import kfp
import kfp.dsl as dsl
from kfp.v2.dsl import Input, Output, Dataset
import string
import hashlib
import gzip
from typing import List, Dict, Optional, Any
import yaml
import pandas as pd
import pyarrow.csv as pv

In [46]:
# stages' function definitions

@dsl.component(
    base_image="python:3.7",
    packages_to_install=['pyarrow']
)
# filtering
def filtering(
    data: str ,  # Input dataset Input[Dataset]
    operation: str, 
    column_name: str, 
    threshold: number,
    filtered_output: str  # Output dataset
) :
    

    # Read the input dataset
    df =pv.read_csv(data)

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    
    # Perform filtering based on the specified operation
    if operation == 'greater_than':
        filtered_data = df[df[column_name] > float(threshold)]
    elif operation == 'less_than':
        filtered_data = df[df[column_name] < float(threshold)]
    elif operation == 'equal_to':
        filtered_data = df[df[column_name] == float(threshold)]
    else:
        raise ValueError("Unsupported operation")

    # Save the filtered DataFrame as a new CSV file
    #filtered_data.to_csv(filtered_output.path, index=False)
    pv.write_csv(filtered_data, output_file=filtered_output)

# Anonymization
@dsl.component(
    base_image="python:3.7",
    packages_to_install=['pandas']
)
def anonymize_columns(
    data: Input[Dataset], 
    columns_to_anonymize: list,  # Update typing for list of columns
    anonymized_output: Output[Dataset]  # Output dataset
):
    # Convert the Input dataset to a DataFrame
    data_df = pv.read_csv(data.path)

    for column in columns_to_anonymize:
        if column in data_df.columns:
            # Anonymize the column using SHA-256 hashing
            data_df[column] = data_df[column].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())
        else:
            print(f"Warning: Column '{column}' not found in the data.")

    # Save the modified DataFrame to the output path
        pv.write_csv(data_df, anonymized_output.path)



# Aggregation
@dsl.component(
    base_image="python:3.7",
    packages_to_install=['pandas']
)
def aggregate_columns(
    data: Input[Dataset], 
    groupby_column: str, 
    columns_to_aggregate: list,  # list[str]
    aggregation_functions: dict,  # dict[str, list[str]]
    aggregated_output: Output[Dataset]  # Output dataset
):
    # Convert Dataset to DataFrame
    data_df = pv.read_csv(data.path)
    
    try:
        # Perform the aggregation
        aggregated_data = data_df.groupby(groupby_column).agg(aggregation_functions)
        
        # Select only the required columns to aggregate
        aggregated_data = aggregated_data[columns_to_aggregate]
        
        # Save the result to the output path
        pv.write_csv(aggregated_data, aggregated_output.path)
    except KeyError as e:
        raise f"Error: Column {e} not found in the data."
    except Exception as e:
        raise f"An error occurred: {e}"


# Function to compress JSON to GZIP
@dsl.component(
    base_image="python:3.7",
    packages_to_install=['pandas', 'gzip']
)
def compress_json_to_gzip(
    data: Input[Dataset], 
    compressed_output: Output[Dataset]  # Output dataset
):
    # Convert Dataset to DataFrame
    data_df = pv.read_csv(data.path)
    
    try:
        # Convert DataFrame to JSON
        json_data = data_df.to_json(orient='records')
        
        # Compress and write to GZIP file
        with gzip.open(compressed_output.path, 'wt', encoding='utf-8') as f:
            f.write(json_data)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Optionally save an empty file in case of failure
        with open(compressed_output.path, 'w') as f:
            f.write('')

# Function to compress CSV to GZIP
@dsl.component(
    base_image="python:3.7",
    packages_to_install=['pandas', 'gzip']
)
def compress_csv_to_gzip(
    data: Input[Dataset], 
    compressed_output: Output[Dataset]  # Output dataset
):
    # Convert Dataset to DataFrame
    data_df = pv.read_csv(data.path)
    
    try:
        # Compress DataFrame to CSV GZIP
        with gzip.open(compressed_output.path, 'wt') as f:
            pv.write_csv(data_df, f)
    except Exception as e:
        raise f"An error occurred: {e}"


TypeError: Artifacts must have both a schema_title and a schema_version, separated by `@`. Got: number

In [None]:
# parse the yaml file to create the kubeflow pipeline:

# Deployment of the kubeflow
class Deployment:
    def __init__(self, namespace: str, prometheusURL: str ):
       self.namespace = namespace
       self.prometheusURL = prometheusURL

#
class Stage:
    def __init__(self, name: str, type: str ,parameter: Dict[str, Any] ):
        self.name = name
        self.type = type
        self.parameter = parameter

#type : csv , pipeline
#metadat = filepath or next name of pipeline to call.          
class Datasource:
    def __init__(self, type: str, metadata: str):
        self.type = type
        self.metadata = metadata

# pipeline name must be unique in the whole config file

class Pipeline:
    def __init__(self, name:string, flow:List[str], datasource:Datasource):
        self.flow = flow
        self.datasource = datasource
        self.name = name
        
class PipelineConfig:
    def __init__(self,pipelines:List[Pipeline],stages:List[Stage],deployment: Deployment):
        self.pipelines = pipelines
        self.stages = stages
        self.deployment = deployment
        
    

In [None]:
# Function to read YAML and convert to PipelineConfig
def read_yaml_to_pipeline_config(file_path: str) -> PipelineConfig:
    with open(file_path, 'r') as file:
        data = yaml.safe_load(file)

        # Create Stage objects from the YAML stages
        stages = [Stage(name=stage['name'], type=stage['type'], parameter=stage['parameter']) for stage in data['stages']]

        # Create Pipeline objects from the YAML pipelines
        pipelines = []
        for pipeline in data['pipelines']:
            datasource = Datasource(type=pipeline['datasource']['type'], metadata=pipeline['datasource']['metadata'])
            pipelines.append(Pipeline(name=pipeline['name'], flow=pipeline['flow'], datasource=datasource))

        deployment_data = data.get('Deployment', {})
        deployment = Deployment(namespace=deployment_data.get('namespace', ''),
                                prometheusURL=deployment_data.get('prometheusURL', ''))
        # Create PipelineConfig object
        pipeline_config = PipelineConfig(pipelines=pipelines, stages=stages, deployment=deployment)

        return pipeline_config

In [None]:
# kubeflow component declaration 

# filtering_op =kfp.components.create_component_from_func(
#             filtering,
#             base_image="python:3.7",
#             packages_to_install=['pandas'])
# 
# anonymization_op =kfp.components.create_component_from_func(
#                         anonymize_columns,
#                         base_image="python:3.7",
#                         packages_to_install=['pandas'])
# 
# aggregation_op = comp.create_component_from_func(
#                 aggregate_columns,
#                 base_image="python:3.7",
#                 packages_to_install=['pandas'])
# 
# 
# compress_json_to_gzip_op= comp.create_component_from_func(
#             compress_json_to_gzip,
#             base_image="python:3.7",
#             packages_to_install=['pandas', 'gzip', 'json'])
# 
# compress_csv_to_gzip_op= comp.create_component_from_func(
#             compress_csv_to_gzip,
#             base_image="python:3.7",
#             packages_to_install=['pandas', 'gzip'])

In [None]:
#create pipeline components based on pipeline config object

# def create_component_for_stage(stage: 'Stage') -> comp.ComponentOp:
#     
#     if stage.type == 'filtering':
#       return kfp.components.create_component_from_func(
#             filtering,
#             base_image="python:3.7",
#             packages_to_install=['pandas'])
#     
#     elif stage.type == 'anonymization':
#         return kfp.components.create_component_from_func(
#                         anonymize_columns,
#                         base_image="python:3.7",
#                         packages_to_install=['pandas'])
#     
#     elif stage.type == 'aggregation':
#        return comp.create_component_from_func(
#                 aggregate_columns,
#                 base_image="python:3.7",
#                 packages_to_install=['pandas'])
#        
#     elif stage.type == 'compress_json_to_gzip':
#        return comp.create_component_from_func(
#             compress_json_to_gzip,
#             base_image="python:3.7",
#             packages_to_install=['pandas', 'gzip', 'json'])
#     
#     elif stage.type == 'compress_csv_to_gzip':
#         return comp.create_component_from_func(
#             compress_csv_to_gzip,
#             base_image="python:3.7",
#             packages_to_install=['pandas', 'gzip'])
#     else:
#         print(f"Error: Unknown stage type '{stage.type}'")
#         return None  
#   

def create_component_for_stage(stage: 'Stage'):
    if stage.type == 'filtering':
        return filtering
    elif stage.type == 'anonymization':
        return anonymize_columns
    elif stage.type == 'aggregation':
        return aggregate_columns
    elif stage.type == 'compress_json_to_gzip':
        return compress_json_to_gzip
    elif stage.type == 'compress_csv_to_gzip':
        return compress_csv_to_gzip
    else:
        print(f"Error: Unknown stage type '{stage.type}'")
        return None


In [None]:
# creating the pipelines based on the pipelines config file

def dynamic_pipeline(data: Input[Dataset], pipeline_config: PipelineConfig, pipeline: Pipeline):
    # Process each pipeline stage dynamically
    data_op = data

    for stage_name in pipeline.flow:
        # Find the stage by name
        stage = next((s for s in pipeline_config.stages if s.name == stage_name), None)

        if stage is None:
            print(f"Error: Stage {stage_name} not found in the configuration.")
            continue

        # Create component for the current stage
        component_op = create_component_for_stage(stage)

        # Dynamically handle each stage
        if component_op:
            # Create the output for the current stage
            if stage.type == 'filtering':
                filtered_output = "/data/"+pipeline.name+"_"+stage_name + "_filtered.csv"  
                
                # Ensure to pass the output as an Output type
                data_op = component_op(
                    data=data_op, 
                    operation=stage.parameter['operation'], 
                    column_name=stage.parameter['column_name'], 
                    threshold=stage.parameter['threshold'],
                    # Pass the filtered_output correctly
                    filtered_output=filtered_output
                )

            elif stage.type == 'anonymization':
                anonymized_output = dsl.Output[Dataset](stage_name + "_anonymized")
                data_op = component_op(
                    data=data_op, 
                    columns_to_anonymize=stage.parameter['columns_to_anonymize'],
                    anonymized_output=anonymized_output  # Pass output variable here
                )

            elif stage.type == 'aggregation':
                aggregated_output = dsl.Output[Dataset](stage_name + "_aggregated")
                data_op = component_op(
                    data=data_op, 
                    groupby_column=stage.parameter['groupby_column'], 
                    columns_to_aggregate=stage.parameter['columns_to_aggregate'], 
                    aggregation_functions=stage.parameter['aggregation_functions'],
                    aggregated_output=aggregated_output  # Pass output variable here
                )

            elif stage.type == 'compress_json_to_gzip':
                compressed_output = dsl.Output[Dataset](stage_name + "_compressed_json")
                data_op = component_op(
                    data=data_op, 
                    compressed_output=compressed_output  # Pass output variable here
                )

            elif stage.type == 'compress_csv_to_gzip':
                compressed_output = dsl.Output[Dataset](stage_name + "_compressed_csv")
                data_op = component_op(
                    data=data_op, 
                    compressed_output=compressed_output  # Pass output variable here
                )
        else:
            raise Exception(f"Component for stage '{stage_name}' could not be created.")

    return data_op





In [None]:
#TODO: (first) pipeline datasource.type must be csv and it's path.
#TODO: support pipelines chains (one pipeline call the other)
#TODO: support conditional pipeline chain and components

def create_pipeline_for_each_config(pipeline_config: PipelineConfig, client: kfp.Client):
    # Iterate over every pipeline in the pipeline config
    for pipeline in pipeline_config.pipelines:
        # Generate a unique name for each pipeline
        pipeline_name = f"pipeline_{pipeline.name}"

        # Define a Kubeflow pipeline dynamically for each pipeline
        @dsl.pipeline(
            name=pipeline_name,
            description=f"Pipeline generated from config: {pipeline.name}"
        )
        def kubeflow_pipeline():
            
            #data = pd.read_csv(pipeline.datasource.metadata)
            #temp_file_path = 'temp_data.csv'  # Change this to a suitable path
            # data.to_csv(temp_file_path, index=False)
            # if not data.empty:
            #     dataset = Dataset(data)
            # else:
            #     raise ValueError("The provided DataFrame is empty.")
            data = pv.read_csv(pipeline.datasource.metadata)  # Load CSV into an Arrow Table
            #dataset = Dataset(data) 
            # Call dynamic_pipeline to handle the stages of this specific pipeline
            dynamic_pipeline(pipeline.datasource.metadata, pipeline_config, pipeline)

        # Compile the pipeline
        pipeline_file_name = pipeline_name + '.zip'
        kfp.compiler.Compiler().compile(kubeflow_pipeline, pipeline_file_name)
        print(f"Pipeline '{pipeline_name}' compiled successfully.")
        
        # Upload the compiled pipeline to the specified namespace
        client.upload_pipeline(pipeline_file_name, name=pipeline_name)
        print(f"Pipeline '{pipeline_name}' uploaded successfully.")

    return "All pipelines compiled successfully!"

In [None]:
def main():
    
    # Load the pipeline configuration from a YAML file
    pipeline_config_file = './data/pipeline_config_sample.yaml'  # Make sure this file exists in your working directory
    pipeline_config = read_yaml_to_pipeline_config(pipeline_config_file)
    
    if pipeline_config.deployment.namespace is None:
        raise Exception('please specify the kubeflow namespace in the yml file.')
    
    client = kfp.Client(namespace=pipeline_config.deployment.namespace)
    # Create and compile pipelines based on the configuration
    result = create_pipeline_for_each_config(pipeline_config, client)
    print(result)  # Optionally print the result message

# Call the main function
if __name__ == "__main__":
    main()