# Govern ML artifacts

We will use MLflow to get a first look at what a model registry is and how to use it.

In [41]:
!pip install mlflow boto boto3 xgboost==1.1.1

Collecting xgboost==1.1.1
  Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
[K     |████████████████████████████████| 127.6 MB 14 kB/s s eta 0:00:01    |██████                          | 24.1 MB 11.5 MB/s eta 0:00:09 MB/s eta 0:00:01     |██████████████████████▏         | 88.2 MB 69.9 MB/s eta 0:00:01
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.7.3
    Uninstalling xgboost-1.7.3:
      Successfully uninstalled xgboost-1.7.3
Successfully installed xgboost-1.1.1


In [4]:
import mlflow.xgboost
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
from mlflow import MlflowClient
from sklearn.ensemble import RandomForestRegressor
import pandas
import xgboost
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
import pyarrow
import os

### MLFLOW related configuration

In [6]:
# Storage credentials and S3 endpoint read from the environment, plus the MLflow tracking server address.
# NOTE(review): the access key and secret are hard-coded in the notebook; prefer injecting them from outside.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": "mlflow-storage",
        "AWS_SECRET_ACCESS_KEY": "mlflow-storage",
        "MLFLOW_S3_ENDPOINT_URL": "https://storage-api.aiengineer.polytech.sandbox-atos.com",
    }
)
mlflow.set_tracking_uri('http://mlflow.mlflow.svc.cluster.local:5000')

### Minimal model creation / training

In [7]:
def xgboost_train(
    training_data,
    booster_params,
    label_column: int = 0, 
    num_iterations: int = 10
):
    """Train an XGBoost model on a cleaned version of the training data.

    NOTE(review): the expressions written as ``...`` (data loading, the list
    of numeric dtypes, the dropped label column and the label itself) are
    placeholders that still have to be completed; the function cannot run
    as written.

    Args:
        training_data: not referenced by the visible code; presumably consumed
            by the elided loading step -- TODO confirm.
        booster_params: forwarded as ``params`` to ``xgboost.train``.
        label_column: not referenced by the visible code; presumably the
            position of the label column -- TODO confirm.
        num_iterations: forwarded as ``num_boost_round`` to ``xgboost.train``.

    Returns:
        The model returned by ``xgboost.train``.
    """
        
    ### load data (placeholder to complete) ###
    df = ...
    
    ### autoclean data to allow only compatible types in features
    ### (keep the selected dtypes, then drop rows containing missing values)
    numerics = ...
    df = df.select_dtypes(include=...)
    df = df.dropna()
    
    ### define data and label from full data
    data=df.drop(...)
    label=...
    dtrain = xgboost.DMatrix(data, label=label)
    
    model = xgboost.train(
        params=booster_params,
        dtrain=dtrain,
        num_boost_round=num_iterations,
    )
    
    return model
    

### booster parameters

In [8]:
# Booster parameters used for the training run.
params = dict(
    objective='reg:squarederror',
    booster='gbtree',
    learning_rate=0.3,
    min_split_loss=0,
    max_depth=6,
)

### MLFLOW tracker

To use the MLflow model registry correctly, we will track our training runs in MLflow.

In [9]:
### list all mlflow experiments known to the tracking server
mlflow.search_experiments()

[<Experiment: artifact_location='s3://guillaume-etevenard/mlflowmodels/experiments-tp4', creation_time=None, experiment_id='10', last_update_time=None, lifecycle_stage='active', name='Guillaume Etevenard experiments', tags={'version': 'TP4'}>]

### Create a new experiment (identified by its `experiment_id`) whose artifacts are stored in our MinIO bucket

In [None]:
### fill in your own identifier, for example: 'john-doe'
username = ""

In [None]:
# An empty username would create the experiment " experiments" with the
# invalid artifact location "s3:///mlflowmodels/experiments-tp4".
if not username:
    raise ValueError("Set `username` (for example 'john-doe') before creating the experiment.")

experiment_name = f"{username} experiments"

# create_experiment fails when the name is already taken, so reuse the
# existing experiment to keep this cell re-runnable.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(
        experiment_name,
        artifact_location=f's3://{username}/mlflowmodels/experiments-tp4',
        tags={"version": "TP4"}
    )
else:
    experiment_id = existing_experiment.experiment_id

In [10]:
# Fetch the experiment back from the tracking server and show its main attributes.
experiment = mlflow.get_experiment(experiment_id)
print(
    f"Name: {experiment.name}",
    f"Experiment_id: {experiment.experiment_id}",
    f"Artifact Location: {experiment.artifact_location}",
    f"Tags: {experiment.tags}",
    f"Lifecycle_stage: {experiment.lifecycle_stage}",
    sep="\n",
)

Name: Guillaume Etevenard experiments
Experiment_id: 10
Artifact Location: s3://guillaume-etevenard/mlflowmodels/experiments-tp4
Tags: {'version': 'TP4'}
Lifecycle_stage: active


### Train the model

In [11]:
### this will log all available params into mlflow
mlflow.xgboost.autolog()

### this context manager binds the run to our own experiment: use the
### `experiment_id` variable rather than a hard-coded id, which only matches
### one particular user's experiment
with mlflow.start_run(experiment_id=experiment_id) as run:
    ### placeholder to complete: train the model inside the run
    model = ...

### inspect results

In [64]:
# The MLflow UI is addressed through hash routes ("/#/..."), as the models link further down already does.
f'click https://ml-registry.aiengineer.polytech.sandbox-atos.com/#/experiments/{experiment_id}'

'click https://ml-registry.aiengineer.polytech.sandbox-atos.com/experiments/10'

![exper](./images/exper.png)

### Register model into registry

In [12]:
# Reuse the tracking URI set in the configuration cell instead of repeating
# the literal URL, so the two cannot drift apart.
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())
name = "ge_chicago_taxi_tips"

#### Create the object model

In [None]:
# Register the (still version-less) model name. Creating a name that already
# exists raises an error, so only create it when it is missing; this keeps the
# cell re-runnable.
if not client.search_registered_models(filter_string=f"name = '{name}'"):
    client.create_registered_model(name)

#### Create a version

In [21]:
# Register the model logged by the training run as a new version of `name`.
desc = "A new 55"
runs_uri = f"runs:/{run.info.run_id}/model"
# Resolve the "runs:/" URI to the underlying artifact location.
model_src = RunsArtifactRepository.get_underlying_uri(runs_uri)
mv = client.create_model_version(
    name,
    model_src,
    run.info.run_id,
    description=desc,
)
print(
    f"Name: {mv.name}",
    f"Version: {mv.version}",
    f"Description: {mv.description}",
    f"Status: {mv.status}",
    f"Stage: {mv.current_stage}",
    sep="\n",
)

2023/01/09 18:42:39 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ge_chicago_taxi_tips, version 5


Name: ge_chicago_taxi_tips
Version: 5
Description: A new 55
Status: READY
Stage: None


### Visualize the version

In [66]:
# Link to the registered model's page in the MLflow UI.
'click https://ml-registry.aiengineer.polytech.sandbox-atos.com/#/models/{}'.format(name)

'click https://ml-registry.aiengineer.polytech.sandbox-atos.com/#/models/ge_chicago_taxi_tips'

![versions](./images/versions.png)

### Pull the model from the registry to test some sample predictions

In [15]:
# Display the registered model name that is loaded from the registry below.
name

'ge_chicago_taxi_tips'

In [22]:
import mlflow.pyfunc

model_name = name
# "None" is the literal stage name reported for the version registered above
# ("Stage: None"). Spell it as a string instead of relying on str(None)
# happening to produce the same text inside the URI.
stage = "None"

# Load the model through the registry URI scheme models:/<name>/<stage>.
modelfromRegistry = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{stage}"
)

In [38]:
# NOTE(review): `...` is a placeholder to complete -- pass the sample rows to score here.
modelfromRegistry.predict(...)

array([ 1.443182  ,  0.13632879,  0.09012622, 10.60915   ,  1.9248669 ],
      dtype=float32)