# Kubeflow Pipeline Auto-Generator

Below is the implementation of a pipeline auto-generator driven by a config.yaml file.


In [50]:
#!pip install kfp==1.8.22 pandas pyyaml

In [51]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
#from kfp.components import InputPath, OutputPath
import pandas as pd
import string
import hashlib
import gzip
#import json
from typing import List, Dict, Optional, Any
import yaml


ImportError: cannot import name 'create_component_from_func' from 'kfp.components' (/Users/sepideh.masoudi/miniconda3/lib/python3.12/site-packages/kfp/components/__init__.py)

In [None]:
# stages' function definitions

# filtering
def compare_rows(data: pd.DataFrame, column_name: str, threshold: str, operation: str) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``column_name`` value passes a comparison.

    Args:
        data: Frame to filter.
        column_name: Column whose values are compared against ``threshold``.
        threshold: Right-hand side of the comparison.
        operation: ``'greater_than'``, ``'less_than'`` or ``'equal_to'``.

    Returns:
        The subset of ``data`` selected by the resulting boolean mask.

    Raises:
        ValueError: If ``operation`` is not one of the three supported names.
    """
    # Reject unknown operations before touching the column at all.
    if operation not in ('greater_than', 'less_than', 'equal_to'):
        raise ValueError("Unsupported operation")

    values = data[column_name]
    if operation == 'greater_than':
        mask = values > threshold
    elif operation == 'less_than':
        mask = values < threshold
    else:
        mask = values == threshold
    return data[mask]


# @dsl.component(
#     base_image="python:3.7",
#     packages_to_install=['pandas']
# )
def filtering(data: pd.DataFrame, operation: str, column_name: str, threshold: str) -> pd.DataFrame:
    """Filtering stage: keep the rows of ``data`` selected by ``compare_rows``.

    This function takes ``operation`` before ``column_name``; the arguments are
    forwarded by keyword so each one reaches the matching ``compare_rows``
    parameter.
    """
    selected_rows = compare_rows(
        data=data,
        column_name=column_name,
        threshold=threshold,
        operation=operation,
    )
    return selected_rows


# Anonymization
# @dsl.component(
#     base_image="python:3.7",
#     packages_to_install=['pandas']
# )
def anonymize_columns(data: pd.DataFrame, columns_to_anonymize: List[str]) -> pd.DataFrame:
    """Return a copy of ``data`` with the given columns replaced by SHA-256 digests.

    Every value in a listed column is converted with ``str()``, encoded and
    replaced by the hexadecimal SHA-256 digest of those bytes. A listed column
    that is absent from ``data`` only produces a warning on stdout.

    Args:
        data: Frame holding the values to anonymize. It is not modified.
        columns_to_anonymize: Names of the columns to hash.

    Returns:
        A new DataFrame; columns that were not listed are carried over as-is.
    """
    # Work on a copy so the caller's frame keeps its original values.
    anonymized = data.copy()
    for column in columns_to_anonymize:
        if column not in anonymized.columns:
            print(f"Warning: Column '{column}' not found in the data.")
            continue
        anonymized[column] = anonymized[column].apply(
            lambda x: hashlib.sha256(str(x).encode()).hexdigest()
        )
    return anonymized


# Aggregation
# @dsl.component(
#     base_image="python:3.7",
#     packages_to_install=['pandas']
# )
def aggregate_columns(data: pd.DataFrame, groupby_column: str, columns_to_aggregate: List[str], aggregation_functions: Dict[str, List[str]]) -> pd.DataFrame:
    """Group ``data`` by one column and aggregate the requested columns.

    Args:
        data: Frame to aggregate.
        groupby_column: Column passed to ``DataFrame.groupby``.
        columns_to_aggregate: Columns selected from the aggregated result.
        aggregation_functions: Mapping handed to ``.agg()``.

    Returns:
        The selected columns of the aggregated frame. Despite the annotation,
        ``None`` is returned (after printing a message) when any step raises.
    """
    try:
        grouped = data.groupby(groupby_column)
        summary = grouped.agg(aggregation_functions)
        # The selection stays inside the try so a missing column is reported too.
        selected = summary[columns_to_aggregate]
    except KeyError as missing:
        print(f"Error: Column {missing} not found in the data.")
        return None
    except Exception as failure:
        print(f"An error occurred: {failure}")
        return None
    return selected


# Function to compress JSON to GZIP
# @dsl.component(
#     base_image="python:3.7",
#     packages_to_install=['pandas', 'gzip']
# )
def compress_json_to_gzip(data: pd.DataFrame, output_file: str) -> Optional[str]:
    """Write ``data`` as gzip-compressed JSON records to ``output_file``.

    Args:
        data: Frame to serialize; ``None`` is accepted and produces no file.
        output_file: Destination path of the gzip file.

    Returns:
        ``output_file`` on success, or ``None`` when ``data`` is ``None`` or
        when serialization or writing fails (the error is printed).
    """
    if data is None:
        return None

    try:
        records_json = data.to_json(orient='records')
        # Text mode lets gzip handle the UTF-8 encoding of the JSON string.
        with gzip.open(output_file, 'wt', encoding='utf-8') as archive:
            archive.write(records_json)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return output_file


# Function to compress CSV to GZIP
# @dsl.component(
#     base_image="python:3.7",
#     packages_to_install=['pandas', 'gzip']
# )
def compress_csv_to_gzip(data: pd.DataFrame, output_file: str) -> Optional[str]:
    """Write ``data`` as a gzip-compressed CSV (without the index) to ``output_file``.

    Args:
        data: Frame to export.
        output_file: Destination path of the gzip file.

    Returns:
        ``output_file`` on success; ``None`` if the export raises, in which
        case the error is printed.
    """
    try:
        data.to_csv(output_file, index=False, compression='gzip')
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    else:
        return output_file
    
# def read_aggregate_and_convert_to_json(data: pd.DataFrame) -> Optional[str]:
#     try:
#         if data is not None:
#             json_result = data.to_json(orient='records')  # Convert DataFrame to JSON
#             return json_result
#         else:
#             return None
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return None    


In [None]:
# parse the yaml file to create the kubeflow pipeline:

# Deployment of the kubeflow
class Deployment:
    """Deployment section of the configuration.

    Attributes are stored exactly as given; nothing is validated here.
    """

    def __init__(self, namespace: str, prometheusURL: str):
        # namespace: Kubeflow namespace; prometheusURL: kept as provided.
        self.namespace, self.prometheusURL = namespace, prometheusURL

#
class Stage:
    """One named processing step together with its type and parameters."""

    def __init__(self, name: str, type: str, parameter: Dict[str, Any]):
        # NOTE: the ``type`` argument shadows the builtin inside this method;
        # the name is kept because callers pass it by keyword.
        self.name, self.type, self.parameter = name, type, parameter

# type: csv or pipeline
# metadata: a file path, or the name of the next pipeline to call.
class Datasource:
    """Where a pipeline takes its input from: a source type plus its metadata."""

    def __init__(self, type: str, metadata: str):
        # Both values are stored unchanged.
        self.type, self.metadata = type, metadata

# pipeline name must be unique in the whole config file

class Pipeline:
    """A named pipeline: an ordered list of stage names and a data source.

    Attributes:
        name: Pipeline name.
        flow: Stage names, in the order they should run.
        datasource: The ``Datasource`` describing the pipeline's input.
    """

    # ``name`` was annotated with the ``string`` module; ``str`` is the type.
    def __init__(self, name: str, flow: List[str], datasource: 'Datasource'):
        self.flow = flow
        self.datasource = datasource
        self.name = name
        
class PipelineConfig:
    """Whole parsed configuration: pipelines, stage definitions and deployment."""

    def __init__(self, pipelines: List[Pipeline], stages: List[Stage], deployment: Deployment):
        # Stored as given; stages are referenced from pipelines by name.
        self.pipelines, self.stages, self.deployment = pipelines, stages, deployment
        
    

In [None]:
# Function to read YAML and convert to PipelineConfig
def read_yaml_to_pipeline_config(file_path: str) -> PipelineConfig:
    """Load a pipeline configuration YAML file into a ``PipelineConfig``.

    Keys read from the document:
      * ``stages``: list of mappings with ``name``, ``type`` and ``parameter``.
      * ``pipelines``: list of mappings with ``name``, ``flow`` and a
        ``datasource`` mapping holding ``type`` and ``metadata``.
      * ``Deployment`` (optional): mapping with ``namespace`` and
        ``prometheusURL``; each defaults to ``''`` when absent.

    Args:
        file_path: Path of the YAML file.

    Returns:
        The populated ``PipelineConfig``.

    Raises:
        ValueError: If the file is empty or its top level is not a mapping.
        KeyError: If a required key is missing.
    """
    # Pin the encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    # safe_load yields None for an empty document; fail with a clear message
    # instead of a "NoneType is not subscriptable" error further down.
    if not isinstance(data, dict):
        raise ValueError(
            f"Pipeline config '{file_path}' must contain a YAML mapping at the top level."
        )

    # Create Stage objects from the YAML stages
    stages = [
        Stage(name=stage['name'], type=stage['type'], parameter=stage['parameter'])
        for stage in data['stages']
    ]

    # Create Pipeline objects from the YAML pipelines
    pipelines = []
    for pipeline in data['pipelines']:
        source = pipeline['datasource']
        datasource = Datasource(type=source['type'], metadata=source['metadata'])
        pipelines.append(Pipeline(name=pipeline['name'], flow=pipeline['flow'], datasource=datasource))

    # A bare "Deployment:" key parses to None, so fall back to an empty mapping.
    deployment_data = data.get('Deployment') or {}
    deployment = Deployment(namespace=deployment_data.get('namespace', ''),
                            prometheusURL=deployment_data.get('prometheusURL', ''))

    return PipelineConfig(pipelines=pipelines, stages=stages, deployment=deployment)

In [None]:
# kubeflow component declaration 

# One component factory per stage function.
# packages_to_install lists only pip-installable dependencies: gzip and json
# are standard-library modules, so requesting them from pip is unnecessary and
# can make the component's install step fail.
filtering_op = comp.create_component_from_func(
    filtering,
    base_image="python:3.7",
    packages_to_install=['pandas'],
)

anonymization_op = comp.create_component_from_func(
    anonymize_columns,
    base_image="python:3.7",
    packages_to_install=['pandas'],
)

aggregation_op = comp.create_component_from_func(
    aggregate_columns,
    base_image="python:3.7",
    packages_to_install=['pandas'],
)

compress_json_to_gzip_op = comp.create_component_from_func(
    compress_json_to_gzip,
    base_image="python:3.7",
    packages_to_install=['pandas'],
)

compress_csv_to_gzip_op = comp.create_component_from_func(
    compress_csv_to_gzip,
    base_image="python:3.7",
    packages_to_install=['pandas'],
)

In [None]:
#create pipeline components based on pipeline config object

def create_component_for_stage(stage: 'Stage') -> comp.ComponentOp:
    """Build the Kubeflow component that implements ``stage.type``.

    Args:
        stage: Stage whose ``type`` selects the function to wrap.

    Returns:
        The result of ``comp.create_component_from_func`` for the matching
        stage function, or ``None`` (after printing an error) when the stage
        type is unknown.
    """
    # Stage type -> function wrapped as a component.
    stage_functions = {
        'filtering': filtering,
        'anonymization': anonymize_columns,
        'aggregation': aggregate_columns,
        'compress_json_to_gzip': compress_json_to_gzip,
        'compress_csv_to_gzip': compress_csv_to_gzip,
    }

    stage_function = stage_functions.get(stage.type)
    if stage_function is None:
        print(f"Error: Unknown stage type '{stage.type}'")
        return None

    return comp.create_component_from_func(
        stage_function,
        base_image="python:3.7",
        # pandas is the only pip dependency; gzip and json are standard-library
        # modules and must not be requested from pip.
        packages_to_install=['pandas'],
    )


# def create_component_for_stage(stage: 'Stage'):
#     if stage.type == 'filtering':
#         return filtering
#     elif stage.type == 'anonymization':
#         return anonymize_columns
#     elif stage.type == 'aggregation':
#         return aggregate_columns
#     elif stage.type == 'compress_json_to_gzip':
#         return compress_json_to_gzip
#     elif stage.type == 'compress_csv_to_gzip':
#         return compress_csv_to_gzip
#     else:
#         print(f"Error: Unknown stage type '{stage.type}'")
#         return None


In [None]:
# creating the pipelines based on the pipelines config file

def dynamic_pipeline(df: pd.DataFrame, pipeline_config: PipelineConfig , pipeline:Pipeline):
    """Chain the components for every stage named in ``pipeline.flow``.

    Each stage name is looked up in ``pipeline_config.stages`` and the first
    match is used. The result of one component call is passed as ``data`` to
    the next one, starting from ``df``.

    Args:
        df: Value passed as ``data`` to the first stage.
        pipeline_config: Configuration holding the stage definitions.
        pipeline: Pipeline whose ``flow`` lists the stage names to run.

    Returns:
        The result of the last component call, or ``df`` if no stage ran.

    Raises:
        Exception: If no component could be created for a stage.
        KeyError: If a stage lacks one of the parameters its type requires.
    """
    # Parameters read from ``stage.parameter`` for each stage type, in call order.
    stage_arguments = {
        'filtering': ('operation', 'column_name', 'threshold'),
        'anonymization': ('columns_to_anonymize',),
        'aggregation': ('groupby_column', 'columns_to_aggregate', 'aggregation_functions'),
        'compress_json_to_gzip': ('output_file',),
        'compress_csv_to_gzip': ('output_file',),
    }

    current = df
    for stage_name in pipeline.flow:
        # First configured stage with a matching name wins.
        for candidate in pipeline_config.stages:
            if candidate.name == stage_name:
                stage = candidate
                break
        else:
            # Unknown stage names are reported and skipped.
            print(f"Error: Stage {stage_name} not found in the configuration.")
            continue

        component_op = create_component_for_stage(stage)
        if not component_op:
            raise Exception(f"Component for stage '{stage_name}' could not be created.")

        argument_names = stage_arguments.get(stage.type)
        if argument_names is None:
            # No call is defined for this type; carry the current value forward.
            continue

        stage_kwargs = {arg: stage.parameter[arg] for arg in argument_names}
        current = component_op(data=current, **stage_kwargs)

    return current




In [None]:
#TODO: (first) pipeline datasource.type must be csv, together with its path.
#TODO: support pipeline chains (one pipeline calling another)

def create_pipeline_for_each_config(pipeline_config: PipelineConfig, client: kfp.Client):
    """Compile and upload one Kubeflow pipeline per configured pipeline.

    For each entry in ``pipeline_config.pipelines`` a pipeline function is
    defined, compiled to ``pipeline_<name>.zip`` and uploaded through
    ``client`` under the name ``pipeline_<name>``.

    Args:
        pipeline_config: Parsed configuration with pipelines and stages.
        client: Kubeflow Pipelines client used for the upload.

    Returns:
        A fixed confirmation message once every pipeline has been processed.
    """
    # No dataset is loaded yet; every pipeline starts from this placeholder.
    df = None

    for pipeline in pipeline_config.pipelines:
        pipeline_name = "pipeline_{}".format(pipeline.name)
        as_kubeflow_pipeline = dsl.pipeline(
            name=pipeline_name,
            description="Pipeline generated from config: {}".format(pipeline.name),
        )

        def kubeflow_pipeline():
            # Wire up the stages of the pipeline currently being processed.
            dynamic_pipeline(df, pipeline_config, pipeline)

        kubeflow_pipeline = as_kubeflow_pipeline(kubeflow_pipeline)

        # Compile to a package named after the pipeline.
        pipeline_file_name = f"{pipeline_name}.zip"
        kfp.compiler.Compiler().compile(kubeflow_pipeline, pipeline_file_name)
        print(f"Pipeline '{pipeline_name}' compiled successfully.")

        # Upload the compiled package through the supplied client.
        client.upload_pipeline(pipeline_file_name, name=pipeline_name)
        print(f"Pipeline '{pipeline_name}' uploaded successfully.")

    return "All pipelines compiled successfully!"

In [None]:
def main():
    """Load the sample configuration, then compile and upload its pipelines.

    Raises:
        ValueError: If the configuration does not specify a namespace.
    """
    # Load the pipeline configuration from a YAML file.
    # Forward slashes work on every platform; the former 'data\p...' literal
    # relied on an invalid escape sequence and only resolved on Windows.
    pipeline_config_file = 'data/pipeline_config_sample.yaml'  # Make sure this file exists in your working directory
    pipeline_config = read_yaml_to_pipeline_config(pipeline_config_file)

    # The loader defaults a missing namespace to '', so reject every empty
    # value rather than only None.
    namespace = pipeline_config.deployment.namespace
    if not namespace:
        raise ValueError('please specify the kubeflow namespace in the yml file.')

    client = kfp.Client(namespace=namespace)
    # Create and compile pipelines based on the configuration
    result = create_pipeline_for_each_config(pipeline_config, client)
    print(result)  # Optionally print the result message


# Call the main function
if __name__ == "__main__":
    main()