In [7]:
import mlrun
from sklearn.datasets import load_breast_cancer
import pandas as pd

@mlrun.handler(outputs=["dataset", "label_column"])
def dataset_loader(context, format="csv"):
    """Load scikit-learn's breast-cancer data and log it as a dataset artifact.

    Note: the log message and the artifact key below say "iris", but the data
    is loaded with ``load_breast_cancer``. The strings are kept unchanged so
    that existing artifact keys keep working.

    :param context: Object exposing ``logger``, ``artifact_path`` and
                    ``log_dataset`` (the ``__main__`` guard in this notebook
                    passes the result of ``mlrun.get_or_create_ctx``).
    :param format:  File format forwarded to ``context.log_dataset``.

    :returns: A ``(dataframe, "target")`` tuple, matching the two names
              declared in the ``mlrun.handler`` decorator's ``outputs`` list.
    """
    cancer_bunch = load_breast_cancer(as_frame=True)
    cancer_frame = cancer_bunch.frame
    # Store the label under the 'target' column name returned below.
    cancer_frame['target'] = cancer_bunch.target

    destination = context.artifact_path
    context.logger.info('saving iris dataset to {}'.format(destination))
    context.log_dataset(
        'iris_dataset',
        df=cancer_frame,
        format=format,
        index=False,
    )

    return cancer_frame, "target"

if __name__ == "__main__":
    # Script entry point: obtain (or create) an MLRun context, read the
    # requested output format from its parameters (falling back to "csv"),
    # and run the loader with it.
    with mlrun.get_or_create_ctx("iris_generator", upload_artifacts=True) as context:
        requested_format = context.get_param("format", "csv")
        dataset_loader(context, requested_format)



> 2025-04-11 04:45:46,275 [info] saving iris dataset to s3://mlrun/projects/default/artifacts


In [3]:
import mlrun
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from mlrun.frameworks.sklearn import apply_mlrun

def train(
    dataset: mlrun.DataItem,
    label_column: str = 'target',
    n_estimators: int = 100,
    learning_rate: float = 0.1,
    max_depth: int = 3,
    model_name: str = "dataset_classifier",
    test_size: float = 0.2,
    random_state: int = 42,
):
    """Train a gradient-boosting classifier on ``dataset``.

    The data is split into train/test sets, a
    ``GradientBoostingClassifier`` is built from the given hyperparameters,
    handed to ``apply_mlrun`` together with the test split, and then fitted
    on the training split. Nothing is returned.

    :param dataset:       Data item whose ``as_df()`` yields the full table,
                          features and label together.
    :param label_column:  Name of the column holding the label.
    :param n_estimators:  Forwarded to ``GradientBoostingClassifier``.
    :param learning_rate: Forwarded to ``GradientBoostingClassifier``.
    :param max_depth:     Forwarded to ``GradientBoostingClassifier``.
    :param model_name:    Name passed to ``apply_mlrun`` for the model.
    :param test_size:     Fraction of rows held out for the test split
                          (previously hard-coded to 0.2).
    :param random_state:  Seed used for both the train/test split and the
                          estimator, so repeated runs are reproducible
                          (the split was previously hard-coded to 42 and the
                          estimator was unseeded).
    """
    df = dataset.as_df()
    X = df.drop(label_column, axis=1)
    y = df[label_column]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Pick model; seeding it makes the fitted trees reproducible as well.
    model = ensemble.GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )

    # Kept before fit(), as in the original ordering.
    # NOTE(review): presumably apply_mlrun instruments the model so that the
    # following fit() is logged and evaluated on the test split - confirm
    # against the MLRun docs before reordering.
    apply_mlrun(model=model, model_name=model_name, x_test=X_test, y_test=y_test)

    model.fit(X_train, y_train)

In [4]:
from cloudpickle import load
import numpy as np
from typing import List
import mlrun

class ClassifierModel(mlrun.serving.V2ModelServer):
    """Serving class that loads a pickled model and answers predict requests."""

    def load(self):
        """Locate the '.pkl' model file and deserialize it into ``self.model``."""
        # get_model returns the local model file plus extra data; the extra
        # data is not used here.
        model_file, extra_data = self.get_model('.pkl')

        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file - only serve model files from a trusted source.
        # The context manager closes the file handle, which the previous
        # `load(open(...))` form left open.
        with open(model_file, 'rb') as model_fp:
            self.model = load(model_fp)

    def predict(self, body: dict) -> List:
        """Run the loaded model on ``body['inputs']`` and return a plain list.

        :param body: Request payload; must contain an ``'inputs'`` entry that
                     ``numpy.asarray`` can convert to an array.

        :returns: The model's predictions converted with ``ndarray.tolist()``.
        """
        feats = np.asarray(body['inputs'])
        results: np.ndarray = self.model.predict(feats)
        return results.tolist()

In [6]:
import mlrun
from kfp import dsl

@dsl.pipeline(name="iris-demo")
def pipeline(model_name="dataset_classifier"):
    """Three-step workflow: ingest the dataset, train with a hyperparameter
    search, then deploy the selected model to the serving function.

    :param model_name: Name forwarded to the ingest and training steps and
                       used as the model key in the serving function.
    """
    # Step 1: ingestion. The "dataset" output feeds the training step.
    ingest = mlrun.run_function(
        "load-iris-data",
        name="load-iris-data",
        params={"format": "pq", "model_name": model_name},
        outputs=["dataset"],
    )

    # Step 2: training over the listed hyperparameter values; the run with
    # the highest "accuracy" is selected.
    # model_name is now forwarded so the pipeline parameter reaches the
    # training step instead of that step silently using its own default.
    # NOTE(review): assumes the "trainer" function's handler accepts a
    # `model_name` parameter (the `train` function in this notebook does) -
    # confirm against how "trainer" is registered in the project.
    train = mlrun.run_function(
        "trainer",
        inputs={"dataset": ingest.outputs["dataset"]},
        params={"model_name": model_name},
        hyperparams={
            "n_estimators": [10, 100],
            "learning_rate": [1e-1, 1e-3],
            "max_depth": [2, 8],
        },
        selector="max.accuracy",
        outputs=["model"],
    )

    # Step 3: deploy the selected model with the serving function (mock mode).
    mlrun.deploy_function(
        "serving",
        models=[
            {
                "key": model_name,
                "model_path": train.outputs["model"],
                "class_name": "ClassifierModel",
            }
        ],
        mock=True,
    )