In [1]:
import os
import kfp
from kfp import dsl

In [2]:
# Hostname interpolated into the "https://<domain>/_/pipeline" URL prefix
# returned by ClientV2._get_url_prefix.
domain = "kubeflow.at.onplural.sh"

In [3]:
def read_token_from_file(path=None):
    """Read a token from a file and return it with surrounding whitespace stripped.

    Args:
        path: Filesystem path of the token file. May be ``None`` (the
            default), which is what ``os.getenv`` yields when the environment
            variable holding the path is not set.

    Returns:
        The file contents with leading/trailing whitespace removed, or
        ``None`` when no path was supplied.

    Raises:
        OSError: If ``path`` is given but the file cannot be opened or read.
    """
    # The signature advertises ``path=None``, but ``open(None)`` raises an
    # opaque TypeError. Treat "no path configured" as "no token" instead.
    if path is None:
        return None
    with open(path, "r") as f:
        return f.read().strip()

class ClientV2(kfp.Client):
    """``kfp.Client`` subclass whose URL prefix is built from the global ``domain``."""

    def _get_url_prefix(self):
        """Return the ``https://<domain>/_/pipeline`` prefix for the configured domain."""
        # NOTE(review): presumably kfp.Client uses this prefix when rendering
        # run/experiment links — confirm against the installed kfp version.
        return "https://{}/_/pipeline".format(domain)

# Connect to the API server through its cluster-local service address and
# authenticate with the token stored in the file named by
# KF_PIPELINES_SA_TOKEN_PATH.
client = ClientV2(
    host="http://kubeflow-pipelines-api-server.kubeflow.svc.cluster.local:8888",
    existing_token=read_token_from_file(
        path=os.getenv("KF_PIPELINES_SA_TOKEN_PATH"),
    ),
)

In [7]:
# Namespace used for the SparkApplication metadata and for the pipeline run.
# Indexing os.environ directly means an unset NAMESPACE raises KeyError here.
namespace = os.environ["NAMESPACE"]
# Container image placed in the SparkApplication's spec.image.
docker_image = "ghcr.io/opengptx/spark/python:pr-10"

def get_resource(
        application_file: str,
        driver_cores: int,
        driver_memory_gb: int,
        executor_instances: int,
        executor_cores: int,
        executor_memory_gb: int,
        name: str = "spark-kfp",
        spark_version: str = "3.2.1",
    ) -> dict:
    """Build a ``SparkApplication`` (sparkoperator.k8s.io/v1beta2) manifest as a dict.

    The manifest describes a Python application in cluster mode, using the
    module-level ``docker_image`` as its image and the module-level
    ``namespace`` as its namespace.

    Args:
        application_file: Value for ``spec.mainApplicationFile``.
        driver_cores: Number of driver cores; also determines the driver
            ``coreLimit`` (1200 millicores per requested core).
        driver_memory_gb: Driver memory, rendered as ``"<n>G"``.
        executor_instances: Number of executor instances.
        executor_cores: Number of cores per executor.
        executor_memory_gb: Executor memory, rendered as ``"<n>G"``.
        name: ``metadata.name`` of the resource. Defaults to ``"spark-kfp"``.
        spark_version: Used for both ``spec.sparkVersion`` and the driver's
            ``version`` label so the two cannot drift apart. Defaults to
            ``"3.2.1"``.

    Returns:
        A new dict holding the complete manifest.
    """
    resource = {
        "apiVersion": "sparkoperator.k8s.io/v1beta2",
        "kind": "SparkApplication",
        "metadata": {
            "name": name,
            "namespace": namespace,
        },
        "spec": {
            "type": "Python",
            "mode": "cluster",
            "image": docker_image,
            "imagePullPolicy": "Always",
            "mainApplicationFile": application_file,
            "sparkVersion": spark_version,
            "restartPolicy": {
                "type": "Never"
            },
            "sparkConf": {
                "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
            },
            "driver": {
                "cores": driver_cores,
                # Limit is expressed in millicores: 1.2x the requested cores.
                "coreLimit": f"{1200*driver_cores}m",
                "memory": f"{driver_memory_gb}G",
                "labels": {
                    "version": spark_version,
                },
                "serviceAccount": "default-editor",
            },
            "executor": {
                "cores": executor_cores,
                "instances": executor_instances,
                "memory": f"{executor_memory_gb}G",
            },
        }
    }

    return resource

In [8]:
@dsl.pipeline(name="spark_pipeline", description="Spark KFP Example")
def local_pipeline():
    """Run two Spark example applications one after the other, then clean up.

    Step 1 applies a SparkApplication for ``num.py``. Step 2 applies one for
    ``num_squared.py`` after step 1, and ``.delete()`` is chained onto it.
    Each apply step succeeds when the application state is ``COMPLETED`` and
    fails when it is ``FAILED``.
    """
    # Both jobs use the same minimal sizing.
    sizing = dict(
        driver_cores=1,
        driver_memory_gb=1,
        executor_instances=1,
        executor_cores=1,
        executor_memory_gb=1,
    )
    succeeded = "status.applicationState.state == COMPLETED"
    # Without a failure condition a failed Spark job is never reported as a
    # step failure; the step would just keep waiting for COMPLETED.
    failed = "status.applicationState.state == FAILED"

    # NOTE(review): both steps apply a resource with the same metadata.name
    # (the get_resource default), so step 2 updates the object created by
    # step 1 rather than creating a new one — confirm this is intended.
    step1 = dsl.ResourceOp(
        name="Create Numbers Dataframe",
        k8s_resource=get_resource(
            application_file="local:///opt/spark/examples/num.py",
            **sizing,
        ),
        action="apply",
        success_condition=succeeded,
        failure_condition=failed,
    )

    # The result of the chain is not needed afterwards, so it is not bound.
    dsl.ResourceOp(
        name="Square the Numbers",
        k8s_resource=get_resource(
            application_file="local:///opt/spark/examples/num_squared.py",
            **sizing,
        ),
        action="apply",
        success_condition=succeeded,
        failure_condition=failed,
    ).after(step1).delete()

In [9]:
# Submit the pipeline as a run in the "Spark KFP Test" experiment and report
# the id of the created run.
run = client.create_run_from_pipeline_func(
    local_pipeline,
    arguments={},
    experiment_name="Spark KFP Test",
    namespace=namespace,
)
print(f"Kubeflow Pipelines run id: {run.run_id}")



Kubeflow Pipelines run id: 141a6138-5b2d-484f-ada2-800e4704191e
