In [1]:
# First cell - Update function and dataset naming
import mlrun
from sklearn.datasets import load_breast_cancer
import pandas as pd

@mlrun.handler(outputs=["dataset", "label_column"])
def dataset_loader(context, format="csv"):
    """Load the scikit-learn breast cancer data and log it as a dataset.

    Args:
        context: MLRun execution context; used for its logger, its
            ``artifact_path`` and its ``log_dataset`` method.
        format: File format forwarded to ``context.log_dataset``.

    Returns:
        A ``(frame, "target")`` tuple. The decorator's ``outputs`` list
        labels the two values "dataset" and "label_column".
    """
    bunch = load_breast_cancer(as_frame=True)
    frame = bunch.frame
    # Store the label series under the 'target' column name.
    frame['target'] = bunch.target

    destination = context.artifact_path
    context.logger.info('saving breast cancer dataset to {}'.format(destination))
    context.log_dataset(
        'breast_cancer_dataset',
        df=frame,
        format=format,
        index=False,
    )

    label_column = "target"
    return frame, label_column

# Script entry point: obtain (or create) the MLRun context named
# "breast_cancer_generator" and run the loader with the context's "format"
# parameter, falling back to "csv" when the parameter is not set.
if __name__ == "__main__":
    with mlrun.get_or_create_ctx("breast_cancer_generator", upload_artifacts=True) as context:
        dataset_loader(context, context.get_param("format", "csv"))

> 2025-04-11 08:35:54,091 [info] logging run results to: http://mlrun-api:8080
> 2025-04-11 08:35:54,236 [info] saving breast cancer dataset to s3://mlrun/projects/default/artifacts


In [2]:
import mlrun
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlrun.frameworks.sklearn import apply_mlrun

def train(
    dataset: mlrun.DataItem,
    label_column: str = 'target',
    n_estimators: int = 100,
    max_depth: int = 3,
    min_samples_split: int = 2,
    model_name: str = "dataset_classifier"
):
    """Fit a random-forest classifier on the given dataset.

    Args:
        dataset: MLRun data item; read into a DataFrame with ``as_df()``.
        label_column: Name of the column holding the labels.
        n_estimators: Forwarded to ``RandomForestClassifier``.
        max_depth: Forwarded to ``RandomForestClassifier``.
        min_samples_split: Forwarded to ``RandomForestClassifier``.
        model_name: Name handed to ``apply_mlrun`` for the model.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    frame = dataset.as_df()
    labels = frame[label_column]
    features = frame.drop(label_column, axis=1)

    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    forest_options = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "random_state": 42,
    }
    model = RandomForestClassifier(**forest_options)

    # Hook the estimator into MLRun before fitting, handing it the held-out
    # split (presumably used for evaluation/logging — confirm in MLRun docs).
    apply_mlrun(
        model=model,
        model_name=model_name,
        x_test=features_test,
        y_test=labels_test,
    )

    model.fit(features_train, labels_train)
    return model

In [3]:
from cloudpickle import load
import numpy as np
from typing import List
import mlrun

class ClassifierModel(mlrun.serving.V2ModelServer):
    """Model server that unpickles a model file and serves its predictions."""

    def load(self):
        """Fetch the '.pkl' model file and deserialize it into ``self.model``."""
        # get_model returns the local model file plus extra data; only the
        # file is used here.
        model_file, _extra_data = self.get_model('.pkl')

        # NOTE(security): unpickling executes arbitrary code — only serve
        # model files that come from a trusted source.
        # The bare name `load` below is the module-level cloudpickle import;
        # method names are not visible as bare names inside method bodies.
        # The `with` block closes the file handle even if unpickling fails.
        with open(model_file, 'rb') as model_fp:
            self.model = load(model_fp)

    def predict(self, body: dict) -> List:
        """Run the loaded model on ``body['inputs']``.

        Args:
            body: Request payload; its 'inputs' entry is converted to a
                NumPy array and passed to ``self.model.predict``.

        Returns:
            The predictions converted to a plain Python list.
        """
        feats = np.asarray(body['inputs'])
        results: np.ndarray = self.model.predict(feats)
        return results.tolist()

In [4]:
import mlrun
from kfp import dsl

@dsl.pipeline(name="breast-cancer-demo")
def pipeline(model_name="dataset_classifier"):
    """Workflow with three steps: load the data, train, deploy.

    Args:
        model_name: Passed as a parameter to the data-loading step and used
            as the model key in the serving deployment.
    """
    # Step 1: run the registered data-loading function.
    ingest_params = dict(format="pq", model_name=model_name)
    ingest_step = mlrun.run_function(
        "load-breast-cancer-data",
        name="load-breast-cancer-data",
        params=ingest_params,
        outputs=["dataset"],
    )

    # Step 2: run the trainer over the candidate hyperparameter values,
    # selecting with "max.accuracy".
    search_space = {
        "n_estimators": [10, 100],
        "max_depth": [2, 8],
        "min_samples_split": [2, 5],
    }
    train_step = mlrun.run_function(
        "trainer",
        inputs={"dataset": ingest_step.outputs["dataset"]},
        hyperparams=search_space,
        selector="max.accuracy",
        outputs=["model"],
    )

    # Step 3: deploy the serving function with the trained model attached.
    served_model = {
        "key": model_name,
        "model_path": train_step.outputs["model"],
        "class_name": "ClassifierModel",
    }
    mlrun.deploy_function(
        "serving",
        models=[served_model],
        mock=True,
    )

In [5]:
# Project setup: create the project and register the functions the workflow uses.

# Initialize or connect to MLRun project
# NOTE(review): the log output recorded for this cell reports the project name
# "breast-cancer-project", which does not match the "mlrun" name passed here —
# the saved output may be stale; confirm.
project = mlrun.new_project("mlrun", "./", user_project=True, init_git=True)
# Register the functions with our project
# NOTE(review): the output recorded for this cell ends in
# "BuildError: cannot convert notebook" — presumably raised while converting
# "Untitled1.ipynb"; confirm the notebook file name/path and the nbconvert setup.
project.set_function("Untitled1.ipynb", "load-breast-cancer-data", image="mlrun/mlrun", handler="dataset_loader")
project.set_function("Untitled1.ipynb", "trainer", image="mlrun/mlrun", handler="train")
# NOTE(review): the serving function comes from the hub here, while the
# ClassifierModel class is defined in this notebook — verify that the class is
# available to the deployed function.
project.set_function(name="serving", func="hub://v2_model_server", handler="ClassifierModel")

# Save the project
project.save()

> 2025-04-11 08:35:56,537 [info] Created and saved project: {"context":"./","from_template":null,"name":"breast-cancer-project","overwrite":false,"save":true}
> 2025-04-11 08:35:56,540 [info] Project created successfully: {"project_name":"breast-cancer-project","stored_in_db":true}
This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-co




BuildError: cannot convert notebook

In [None]:

# Run the workflow and keep the returned run-status object in `run`.
# `project.run` takes the workflow name through its `name` parameter; it has
# no `workflow_name` keyword, so the previous call raised a TypeError.
# NOTE(review): a workflow named "breast-cancer-pipeline" must be registered
# on the project (e.g. with `project.set_workflow`) before this call — no
# registration is visible in this notebook; confirm.
run = project.run(
    name="breast-cancer-pipeline",
    arguments={
        "model_name": "breast-cancer-classifier"
    },
    artifact_path=project.artifact_path
)

In [None]:
# Report the workflow status.
# The object returned by `project.run` exposes the workflow status through its
# `state` property, so no separate lookup on the project is needed
# (`project.get_run` / `run.id` are not part of that API).
print(f"Pipeline status: {run.state}")

# Retrieve the model
# NOTE(review): the second positional argument of `get_artifact` is presumably
# the artifact tag rather than its key — confirm that an artifact with key
# "model" and tag "breast-cancer-classifier" actually exists in the project.
model = project.get_artifact("model", "breast-cancer-classifier")
print(f"Model details: {model}")

In [None]:
# Test the deployed model with sample data
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Get sample data: the first five rows, without the label column
breast_cancer = load_breast_cancer(as_frame=True)
sample_data = breast_cancer.frame.iloc[:5].drop("target", axis=1).values.tolist()

# Test inference
# The project object has no `get_service`; fetch the registered serving
# function instead and call its V2 inference route. The model key in the path
# must match the "model_name" argument the workflow was run with.
server = project.get_function("serving")
response = server.invoke(
    "/v2/models/breast-cancer-classifier/infer",
    body={"inputs": sample_data},
)
print(f"Prediction results: {response}")

In [None]:
# Print the `url` attribute of the MLRun run-DB client.
# NOTE(review): the label says "MLRun UI", but the value is read from the
# run-DB client — confirm that this attribute exists on the installed MLRun
# version and whether it is the UI address or the API address.
print(f"MLRun UI: {mlrun.get_run_db().url}")