diff --git a/README.md b/README.md index 067501860..5ebbec6f0 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,10 @@ python -m torch.distributed.launch --nproc_per_node 8 --master_port 9527 train_d Under construction. +## Loggers + +Check the [YOLOv9 loggers documentation](utils/loggers/README.md). + ## Citation diff --git a/utils/loggers/README.md b/utils/loggers/README.md new file mode 100644 index 000000000..413e8239c --- /dev/null +++ b/utils/loggers/README.md @@ -0,0 +1,59 @@ +# Loggers + +## Supported platforms + +* [TensorBoard](https://www.tensorflow.org/tensorboard) +* [Weights & Biases](https://wandb.ai/) +* [ClearML](https://clear.ml/) +* [Comet](https://www.comet.com/) +* [MLflow](https://mlflow.org/) + +## How to use? + +### TensorBoard + +TODO + +### Weights & Biases + +TODO + +### ClearML + +TODO + +### Comet + +TODO + +### MLflow + +MLflow is an open-source platform for monitoring and managing machine learning experiments. + +1. __Prerequisites__ + +Make sure you have installed the MLflow library: + +    pip install mlflow + +2. __MLflow Server__ + +Launch your MLflow server. You can run it with the following command: + +    mlflow server --backend-store-uri mlflow_server + +This will start a local server at http://127.0.0.1:5000 by default and save all mlflow logs to the `mlflow_server` directory at the location of the command execution. + +To kill all running MLflow instances, you can run this command: + +    ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9 + +3. __MLflow parameters__ + +Set your server address in the `MLFLOW_TRACKING_URI` environment variable. If the address is not provided, a warning will be raised and the run will not be recorded. + +Set the name of your experiment in the `MLFLOW_EXPERIMENT_NAME` environment variable. If no name is provided, the project name (--project of train.py) will be set by default. + +Define the name of your run in the `MLFLOW_RUN` environment variable. 
If no name is provided, the run name (--name of train.py) will be set by default. + +After that, your training sessions will be saved in your MLflow server! diff --git a/utils/loggers/__init__.py b/utils/loggers/__init__.py index 8fc8377ab..120ae21a4 100644 --- a/utils/loggers/__init__.py +++ b/utils/loggers/__init__.py @@ -12,7 +12,7 @@ from utils.plots import plot_images, plot_labels, plot_results from utils.torch_utils import de_parallel -LOGGERS = ('csv', 'tb', 'wandb', 'clearml', 'comet') # *.csv, TensorBoard, Weights & Biases, ClearML +LOGGERS = ('csv', 'tb', 'wandb', 'clearml', 'comet', 'mlflow') # *.csv, TensorBoard, Weights & Biases, ClearML, MLflow RANK = int(os.getenv('RANK', -1)) try: @@ -48,6 +48,14 @@ except (ModuleNotFoundError, ImportError, AssertionError): comet_ml = None +try: + import mlflow + + assert hasattr(mlflow, '__version__') # verify package import not local dir + from utils.loggers.mlflow import MLflowLogger +except (ImportError, AssertionError): + mlflow = None + class Loggers(): # YOLO Loggers class @@ -91,6 +99,10 @@ def __init__(self, save_dir=None, weights=None, opt=None, hyp=None, logger=None, prefix = colorstr('Comet: ') s = f"{prefix}run 'pip install comet_ml' to automatically track and visualize YOLO 🚀 runs in Comet" self.logger.info(s) + if not mlflow: + prefix = colorstr('MLflow: ') + s = f"{prefix}run 'pip install mlflow' to automatically track and visualize YOLO 🚀 runs in MLflow" + self.logger.info(s) # TensorBoard s = self.save_dir if 'tb' in self.include and not self.opt.evolve: @@ -129,6 +141,12 @@ def __init__(self, save_dir=None, weights=None, opt=None, hyp=None, logger=None, else: self.comet_logger = None + # MLflow + if mlflow and 'mlflow' in self.include: + self.mlflow = MLflowLogger(self.opt, self.hyp) + else: + self.mlflow = None + @property def remote_dataset(self): # Get data_dict if custom dataset artifact link is provided @@ -139,6 +157,8 @@ def remote_dataset(self): data_dict = self.wandb.data_dict if 
self.comet_logger: data_dict = self.comet_logger.data_dict + if self.mlflow: + data_dict = self.mlflow.data_dict return data_dict @@ -161,6 +181,8 @@ def on_pretrain_routine_end(self, labels, names): # pass # ClearML saves these images automatically using hooks if self.comet_logger: self.comet_logger.on_pretrain_routine_end(paths) + if self.mlflow: + self.mlflow.on_pretrain_routine_end(paths) def on_train_batch_end(self, model, ni, imgs, targets, paths, vals): log_dict = dict(zip(self.keys[0:3], vals)) @@ -250,6 +272,9 @@ def on_fit_epoch_end(self, vals, epoch, best_fitness, fi): if self.comet_logger: self.comet_logger.on_fit_epoch_end(x, epoch=epoch) + if self.mlflow: + self.mlflow.on_fit_epoch_end(x, epoch=epoch) + def on_model_save(self, last, epoch, final_epoch, best_fitness, fi): # Callback runs on model save event if (epoch + 1) % self.opt.save_period == 0 and not final_epoch and self.opt.save_period != -1: @@ -263,6 +288,9 @@ def on_model_save(self, last, epoch, final_epoch, best_fitness, fi): if self.comet_logger: self.comet_logger.on_model_save(last, epoch, final_epoch, best_fitness, fi) + if self.mlflow and not final_epoch: + self.mlflow.on_model_save(last) + def on_train_end(self, last, best, epoch, results): # Callback runs on training end, i.e. 
saving best model if self.plots: @@ -295,6 +323,9 @@ def on_train_end(self, last, best, epoch, results): final_results = dict(zip(self.keys[3:10], results)) self.comet_logger.on_train_end(files, self.save_dir, last, best, epoch, final_results) + if self.mlflow: + self.mlflow.on_train_end(self.save_dir) + def on_params_update(self, params: dict): # Update hyperparams or configs of the experiment if self.wandb: diff --git a/utils/loggers/mlflow/__init__.py b/utils/loggers/mlflow/__init__.py new file mode 100644 index 000000000..32fe7d36b --- /dev/null +++ b/utils/loggers/mlflow/__init__.py @@ -0,0 +1,79 @@ +import os +from utils.general import LOGGER, colorstr +from pathlib import Path + +try: + import mlflow +except (ModuleNotFoundError, ImportError): + mlflow = None + +PREFIX = colorstr("MLflow: ") +SANITIZE = lambda x: {k.replace(':', '_').replace(';', '_'): v for k, v in x.items()} + + +class MLflowLogger: + """Log metrics, parameters, models and much more with MLflow""" + + + def __init__(self, opt, hyp) -> None: + + self.mlflow = mlflow + self.data_dict = None + self.opt = opt + self.hyp = hyp + + + def on_pretrain_routine_end(self, paths): + + global mlflow + + uri = os.environ.get("MLFLOW_TRACKING_URI") + LOGGER.info(f"{PREFIX} tracking uri: {uri}") + mlflow.set_tracking_uri(uri) + experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME") or self.opt.save_dir.split('/')[-2] + run_name = os.environ.get("MLFLOW_RUN") or self.opt.save_dir.split('/')[-1] + mlflow.set_experiment(experiment_name) + mlflow.pytorch.autolog() + + try: + active_run = mlflow.active_run() or mlflow.start_run(run_name=run_name) + LOGGER.info(f"{PREFIX}logging run_id({active_run.info.run_id}) to {uri}") + + # Don't save hyps from opt directly + mlflow.log_params({f"param/{key}": value for key, value in vars(self.opt).items() if key != 'hyp'}) + mlflow.log_params({f"hyp/{key}": value for key, value in self.hyp.items()}) + + except Exception as e: + LOGGER.warning(f"{PREFIX}WARNING ⚠️ 
Failed to initialize: {e}\n" f"{PREFIX}WARNING ⚠️ Not tracking this run") + self.mlflow = None + + + for path in paths: + mlflow.log_artifact(str(path)) + + + def on_fit_epoch_end(self, vals, epoch): + """Log training metrics at the end of each fit epoch to MLflow.""" + + if self.mlflow: + mlflow.log_metrics(metrics=SANITIZE(vals), step=epoch) + + + def on_model_save(self, last): + if self.mlflow: + mlflow.log_artifacts(os.path.dirname(last), artifact_path="weights") + + + def on_train_end(self, save_dir): + """Log model artifacts at the end of the training.""" + + if self.mlflow: + mlflow.log_artifacts(save_dir) + + mlflow.end_run() + + LOGGER.info( + f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n" + ) + +