Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MLflow logger #87

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ python -m torch.distributed.launch --nproc_per_node 8 --master_port 9527 train_d

Under construction.

## Loggers

Check the [YOLOv9 loggers documentation](utils/loggers/README.md).


## Citation

Expand Down
59 changes: 59 additions & 0 deletions utils/loggers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Loggers

## Supported platforms

* [TensorBoard](https://www.tensorflow.org/tensorboard)
* [Weights & Biases](https://wandb.ai/)
* [ClearML](https://clear.ml/)
* [Comet](https://www.comet.com/)
* [MLflow](https://mlflow.org/)

## How to use?

### TensorBoard

TODO

### Weights & Biases

TODO

### ClearML

TODO

### Comet

TODO

### MLflow

MLflow is an open-source platform for monitoring and managing machine learning experiments.

1. __Prerequisites__

Make sure you have installed the MLflow library:

pip install mlflow

2. __MLflow server__

Launch your MLflow server. You can run it with the following command:

mlflow server --backend-store-uri mlflow_server

This will start a local server at http://127.0.0.1:5000 by default and save all mlflow logs to the `mlflow_server` directory at the location of the command execution.

To kill all running MLflow processes, you can run this command:

ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9

3. __MLflow parameters__

Set your server address in the `MLFLOW_TRACKING_URI` environment variable. If the address is not provided, a warning will be raised and the run will not be recorded.

Set the name of your experiment in the `MLFLOW_EXPERIMENT_NAME` environment variable. If no name is provided, the project name (--project of train.py) will be set by default.

Define the name of your run in the `MLFLOW_RUN` environment variable. If no name is provided, the run name (--name of train.py) will be set by default.

After that, your training sessions will be saved in your MLflow server!
33 changes: 32 additions & 1 deletion utils/loggers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from utils.plots import plot_images, plot_labels, plot_results
from utils.torch_utils import de_parallel

LOGGERS = ('csv', 'tb', 'wandb', 'clearml', 'comet') # *.csv, TensorBoard, Weights & Biases, ClearML
LOGGERS = ('csv', 'tb', 'wandb', 'clearml', 'comet', 'mlflow')  # *.csv, TensorBoard, Weights & Biases, ClearML, Comet, MLflow
RANK = int(os.getenv('RANK', -1))

try:
Expand Down Expand Up @@ -48,6 +48,14 @@
except (ModuleNotFoundError, ImportError, AssertionError):
comet_ml = None

try:
import mlflow

assert hasattr(mlflow, '__version__') # verify package import not local dir
from utils.loggers.mlflow import MLflowLogger
except (ImportError, AssertionError):
mlflow = None


class Loggers():
# YOLO Loggers class
Expand Down Expand Up @@ -91,6 +99,10 @@ def __init__(self, save_dir=None, weights=None, opt=None, hyp=None, logger=None,
prefix = colorstr('Comet: ')
s = f"{prefix}run 'pip install comet_ml' to automatically track and visualize YOLO 🚀 runs in Comet"
self.logger.info(s)
if not mlflow:
prefix = colorstr('MLflow: ')
s = f"{prefix}run 'pip install mlflow' to automatically track and visualize YOLO 🚀 runs in MLflow"
self.logger.info(s)
# TensorBoard
s = self.save_dir
if 'tb' in self.include and not self.opt.evolve:
Expand Down Expand Up @@ -129,6 +141,12 @@ def __init__(self, save_dir=None, weights=None, opt=None, hyp=None, logger=None,
else:
self.comet_logger = None

# MLflow
if mlflow and 'mlflow' in self.include:
self.mlflow = MLflowLogger(self.opt, self.hyp)
else:
self.mlflow = None

@property
def remote_dataset(self):
# Get data_dict if custom dataset artifact link is provided
Expand All @@ -139,6 +157,8 @@ def remote_dataset(self):
data_dict = self.wandb.data_dict
if self.comet_logger:
data_dict = self.comet_logger.data_dict
if self.mlflow:
data_dict = self.mlflow.data_dict

return data_dict

Expand All @@ -161,6 +181,8 @@ def on_pretrain_routine_end(self, labels, names):
# pass # ClearML saves these images automatically using hooks
if self.comet_logger:
self.comet_logger.on_pretrain_routine_end(paths)
if self.mlflow:
self.mlflow.on_pretrain_routine_end(paths)

def on_train_batch_end(self, model, ni, imgs, targets, paths, vals):
log_dict = dict(zip(self.keys[0:3], vals))
Expand Down Expand Up @@ -250,6 +272,9 @@ def on_fit_epoch_end(self, vals, epoch, best_fitness, fi):
if self.comet_logger:
self.comet_logger.on_fit_epoch_end(x, epoch=epoch)

if self.mlflow:
self.mlflow.on_fit_epoch_end(x, epoch=epoch)

def on_model_save(self, last, epoch, final_epoch, best_fitness, fi):
# Callback runs on model save event
if (epoch + 1) % self.opt.save_period == 0 and not final_epoch and self.opt.save_period != -1:
Expand All @@ -263,6 +288,9 @@ def on_model_save(self, last, epoch, final_epoch, best_fitness, fi):
if self.comet_logger:
self.comet_logger.on_model_save(last, epoch, final_epoch, best_fitness, fi)

if self.mlflow and not final_epoch:
self.mlflow.on_model_save(last)

def on_train_end(self, last, best, epoch, results):
# Callback runs on training end, i.e. saving best model
if self.plots:
Expand Down Expand Up @@ -295,6 +323,9 @@ def on_train_end(self, last, best, epoch, results):
final_results = dict(zip(self.keys[3:10], results))
self.comet_logger.on_train_end(files, self.save_dir, last, best, epoch, final_results)

if self.mlflow:
self.mlflow.on_train_end(self.save_dir)

def on_params_update(self, params: dict):
# Update hyperparams or configs of the experiment
if self.wandb:
Expand Down
79 changes: 79 additions & 0 deletions utils/loggers/mlflow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from utils.general import LOGGER, colorstr
from pathlib import Path

try:
import mlflow
except (ModuleNotFoundError, ImportError):
mlflow = None

PREFIX = colorstr("MLflow: ")
def SANITIZE(x):
    """Return a copy of dict *x* with ':' and ';' in keys replaced by '_'.

    MLflow restricts the characters allowed in metric/param keys, so keys
    are sanitized before being sent to the tracking server.
    """
    return {k.replace(':', '_').replace(';', '_'): v for k, v in x.items()}


class MLflowLogger:
    """Log metrics, parameters, models and much more with MLflow."""

    def __init__(self, opt, hyp) -> None:
        """Store run configuration; the MLflow run itself is started in on_pretrain_routine_end.

        Args:
            opt: argparse Namespace of training options (opt.save_dir is used for naming).
            hyp: dict of hyperparameters to log as run params.
        """
        self.mlflow = mlflow  # module handle, or None when the package is missing / init failed
        self.data_dict = None  # no remote-dataset support; kept for Loggers API parity
        self.opt = opt
        self.hyp = hyp

    def on_pretrain_routine_end(self, paths):
        """Start the MLflow run, log opt/hyp params and upload pre-training plot artifacts.

        Args:
            paths: iterable of plot file paths produced before training starts.
        """
        if not self.mlflow:  # nothing to do if mlflow is unavailable
            return

        uri = os.environ.get("MLFLOW_TRACKING_URI")
        LOGGER.info(f"{PREFIX} tracking uri: {uri}")
        mlflow.set_tracking_uri(uri)
        # Fall back to save_dir components (e.g. runs/train/exp -> experiment 'train', run 'exp')
        experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME") or self.opt.save_dir.split('/')[-2]
        run_name = os.environ.get("MLFLOW_RUN") or self.opt.save_dir.split('/')[-1]
        mlflow.set_experiment(experiment_name)
        mlflow.pytorch.autolog()

        try:
            active_run = mlflow.active_run() or mlflow.start_run(run_name=run_name)
            LOGGER.info(f"{PREFIX}logging run_id({active_run.info.run_id}) to {uri}")

            # Don't save hyps from opt directly
            mlflow.log_params({f"param/{key}": value for key, value in vars(self.opt).items() if key != 'hyp'})
            mlflow.log_params({f"hyp/{key}": value for key, value in self.hyp.items()})

        except Exception as e:
            LOGGER.warning(f"{PREFIX}WARNING ⚠️ Failed to initialize: {e}\n" f"{PREFIX}WARNING ⚠️ Not tracking this run")
            self.mlflow = None
            return  # bug fix: don't attempt artifact uploads after a failed initialization

        for path in paths:
            mlflow.log_artifact(str(path))

    def on_fit_epoch_end(self, vals, epoch):
        """Log training metrics at the end of each fit epoch to MLflow."""
        if self.mlflow:
            mlflow.log_metrics(metrics=SANITIZE(vals), step=epoch)

    def on_model_save(self, last):
        """Upload the weights directory containing checkpoint *last* as run artifacts."""
        if self.mlflow:
            mlflow.log_artifacts(os.path.dirname(last), artifact_path="weights")

    def on_train_end(self, save_dir):
        """Log all result artifacts from *save_dir* and close the MLflow run."""
        if self.mlflow:
            mlflow.log_artifacts(save_dir)

            mlflow.end_run()

            LOGGER.info(
                f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n"
            )