In [4]:

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn


In [5]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [6]:
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df['target'] = cancer['target']
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [9]:

#Pull out some test data to us with the mlflow model api.


train, test = train_test_split(df, test_size=0.2)
test_target = test['target']
test[['target']].to_csv('test-target.csv', index=False)
del test['target']
test.to_csv('test.csv', index=False)

In [10]:
#Split training data, build a pipeline, and train a model.


features = [x for x in list(train.columns) if x != 'target']
x_raw = train[features]
y_raw = train['target']

x_train, x_test, y_train, y_test = train_test_split(x_raw, y_raw,
                                                            test_size=.20,
                                                            random_state=123,
                                                            stratify=y_raw)
clf = RandomForestClassifier(n_estimators=100,
                            min_samples_leaf=2,
                            class_weight='balanced',
                            random_state=123)

preprocessor = Pipeline(steps=[('scaler', StandardScaler())])

model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('randomforestclassifier', clf)])
model.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('scaler', StandardScaler())])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight='balanced',
                                        min_samples_leaf=2,
                                        random_state=123))])

In [13]:
#Check out some model metrics.

accuracy_train = model.score(x_train, y_train)
accuracy_train


0.9945054945054945

In [14]:

model.score(x_test, y_test)

0.967032967032967

In [11]:


model.get_params()


{'memory': None,
 'steps': [('preprocessor', Pipeline(steps=[('scaler', StandardScaler())])),
  ('randomforestclassifier',
   RandomForestClassifier(class_weight='balanced', min_samples_leaf=2,
                          random_state=123))],
 'verbose': False,
 'preprocessor': Pipeline(steps=[('scaler', StandardScaler())]),
 'randomforestclassifier': RandomForestClassifier(class_weight='balanced', min_samples_leaf=2,
                        random_state=123),
 'preprocessor__memory': None,
 'preprocessor__steps': [('scaler', StandardScaler())],
 'preprocessor__verbose': False,
 'preprocessor__scaler': StandardScaler(),
 'preprocessor__scaler__copy': True,
 'preprocessor__scaler__with_mean': True,
 'preprocessor__scaler__with_std': True,
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__ccp_alpha': 0.0,
 'randomforestclassifier__class_weight': 'balanced',
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier

In [15]:
#update model to give probabilities instead of binary target.

def overwrite_predict(func):
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            return [round(x, 4) for x in result[:, 1]]
        return wrapper

model.predict = overwrite_predict(model.predict_proba)



In [16]:
#Set mlflow configuration.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("my-experiment")

2023/11/05 11:39:22 INFO mlflow.tracking.fluent: Experiment with name 'my-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/142079793624959880', creation_time=1699164562519, experiment_id='142079793624959880', last_update_time=1699164562519, lifecycle_stage='active', name='my-experiment', tags={}>

In [17]:
from mlflow.tracking import MlflowClient
client = MlflowClient()
model_name = "clf-model"

with mlflow.start_run() as run:
  run_num = run.info.run_id
  model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run_num, artifact_path=model_name)

  mlflow.log_metric('accuracy_train', accuracy_train)
  mlflow.sklearn.log_model(model, model_name)
  #mlflow.sklearn.save_model(model, "clf-model")

  model_details = mlflow.register_model(
    model_uri=model_uri,
    name=model_name)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Successfully registered model 'clf-model'.
2023/11/05 11:40:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: clf-model, version 1
Created version '1' of model 'clf-model'.


In [18]:
model_version_infos = client.search_model_versions("name = '%s'" % model_name)
new_model_version = max([model_version_info.version for model_version_info in model_version_infos])

In [19]:
from mlflow.entities.model_registry.model_version_status import ModelVersionStatus

def wait_model_transition(model_name, model_version, stage):
  client = MlflowClient()
  for _ in range(10):
    model_version_details = client.get_model_version(
      name=model_name,
      version=model_version,
    )
    status = ModelVersionStatus.from_string(model_version_details.status)
    print("Model status: %s" % ModelVersionStatus.to_string(status))
    if status == ModelVersionStatus.READY:
      client.transition_model_version_stage(
          name=model_name,
          version=model_version,
          stage=stage,
        )
      break
    time.sleep(1)

In [20]:
try:
  wait_model_transition(model_name, int(new_model_version)-1, "None")
except:
  pass

wait_model_transition(model_name, new_model_version, "Staging")

Model status: READY


In [21]:
new_model_version

'1'

In [22]:
client.update_model_version(
  name=model_name,
  version=new_model_version,
  description="This model is a random forest classifier for the breast cancer dataset from sklearn."
)

<ModelVersion: aliases=[], creation_timestamp=1699164604199, current_stage='Staging', description=('This model is a random forest classifier for the breast cancer dataset from '
 'sklearn.'), last_updated_timestamp=1699164861235, name='clf-model', run_id='3088d8a43f3647119718e7c5def230bc', run_link='', source='mlflow-artifacts:/142079793624959880/3088d8a43f3647119718e7c5def230bc/artifacts/clf-model', status='READY', status_message='', tags={}, user_id='', version='1'>

# Pulling a model from the registry.

In [23]:
import mlflow.pyfunc

model_name = "clf-model"
stage = 'Staging'

loaded_model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{stage}"
)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [24]:
predicted_probs = loaded_model.predict(test[features])
predicted_probs

[0.1175,
 0.0436,
 0.9782,
 1.0,
 0.9543,
 0.9721,
 1.0,
 0.9849,
 0.0046,
 0.9862,
 1.0,
 1.0,
 0.9391,
 1.0,
 1.0,
 0.0,
 0.2641,
 0.0222,
 0.99,
 0.5657,
 0.0,
 0.9409,
 1.0,
 0.0,
 0.9538,
 1.0,
 1.0,
 0.3482,
 0.6295,
 0.0165,
 0.9058,
 1.0,
 0.9899,
 0.9664,
 1.0,
 0.0373,
 0.1512,
 0.0158,
 0.1526,
 0.9873,
 0.9447,
 0.0,
 1.0,
 0.8492,
 0.8259,
 0.0271,
 1.0,
 0.872,
 0.9535,
 0.145,
 0.0063,
 0.7071,
 0.8967,
 1.0,
 1.0,
 0.959,
 1.0,
 0.0438,
 0.9175,
 0.0,
 1.0,
 0.0,
 0.2998,
 0.0,
 0.0,
 0.0163,
 0.9936,
 0.9873,
 0.4898,
 0.0369,
 0.9762,
 0.6766,
 0.0136,
 0.9512,
 0.9301,
 0.0,
 0.9847,
 1.0,
 1.0,
 0.0136,
 1.0,
 0.0036,
 1.0,
 0.0122,
 1.0,
 0.4494,
 0.0136,
 0.88,
 0.0058,
 0.9607,
 0.8753,
 0.99,
 0.8679,
 0.0727,
 0.0,
 0.3854,
 0.0,
 0.9386,
 0.0,
 0.0036,
 1.0,
 0.1579,
 0.5473,
 0.159,
 0.9936,
 0.0797,
 0.9607,
 1.0,
 0.3536,
 0.952,
 0.3201,
 0.0063,
 0.9576,
 1.0]

In [25]:
#Make sure model is served at port 1234
api_response = !curl http://localhost:1234/invocations  -H 'Content-Type: text/csv' --data-binary @test.csv
api_response

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0',
 '  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0',
 '  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0',
 "curl: (7) Failed to connect to localhost port 1234 after 2251 ms: Couldn't connect to server",
 '  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '  0     0    0     0    0     0      0      0 --:--:--  0:00:

In [29]:
mlflow.end_run()

