Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 59 additions & 19 deletions monai/handlers/mlflow_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

Events, _ = optional_import("ignite.engine", IgniteInfo.OPT_IMPORT_VERSION, min_version, "Events")
mlflow, _ = optional_import("mlflow")
mlflow.entities, _ = optional_import("mlflow.entities")

if TYPE_CHECKING:
from ignite.engine import Engine
Expand Down Expand Up @@ -52,7 +53,7 @@ class MLFlowHandler:
Args:
tracking_uri: connects to a tracking URI. can also set the `MLFLOW_TRACKING_URI` environment
variable to have MLflow find a URI from there. in both cases, the URI can either be
a HTTP/HTTPS URI for a remote server, a database connection string, or a local path
an HTTP/HTTPS URI for a remote server, a database connection string, or a local path
to log data to a directory. The URI defaults to path `mlruns`.
for more details: https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.set_tracking_uri.
iteration_log: whether to log data to MLFlow when iteration completed, default to `True`.
Expand Down Expand Up @@ -109,9 +110,6 @@ def __init__(
optimizer_param_names: Union[str, Sequence[str]] = "lr",
close_on_complete: bool = False,
) -> None:
if tracking_uri is not None:
mlflow.set_tracking_uri(tracking_uri)

self.iteration_log = iteration_log
self.epoch_log = epoch_log
self.epoch_logger = epoch_logger
Expand All @@ -125,8 +123,10 @@ def __init__(
self.experiment_param = experiment_param
self.artifacts = ensure_tuple(artifacts)
self.optimizer_param_names = ensure_tuple(optimizer_param_names)
self.client = mlflow.MlflowClient()
self.client = mlflow.MlflowClient(tracking_uri=tracking_uri if tracking_uri else None)
self.close_on_complete = close_on_complete
self.experiment = None
self.cur_run = None

def _delete_exist_param_in_dict(self, param_dict: Dict) -> None:
"""
Expand All @@ -135,9 +135,11 @@ def _delete_exist_param_in_dict(self, param_dict: Dict) -> None:
Args:
param_dict: parameter dict to be logged to mlflow.
"""
if self.cur_run is None:
return

key_list = list(param_dict.keys())
cur_run = mlflow.active_run()
log_data = self.client.get_run(cur_run.info.run_id).data
log_data = self.client.get_run(self.cur_run.info.run_id).data
log_param_dict = log_data.params
for key in key_list:
if key in log_param_dict:
Expand Down Expand Up @@ -167,17 +169,52 @@ def start(self, engine: Engine) -> None:
Check MLFlow status and start if not active.

"""
mlflow.set_experiment(self.experiment_name)
if mlflow.active_run() is None:
self._set_experiment()
if not self.experiment:
raise ValueError(f"Failed to set experiment '{self.experiment_name}' as the active experiment")

if not self.cur_run:
run_name = f"run_{time.strftime('%Y%m%d_%H%M%S')}" if self.run_name is None else self.run_name
mlflow.start_run(run_name=run_name)
runs = self.client.search_runs(self.experiment.experiment_id)
runs = [r for r in runs if r.info.run_name == run_name or not self.run_name]
if runs:
self.cur_run = self.client.get_run(runs[-1].info.run_id) # pick latest active run
else:
self.cur_run = self.client.create_run(experiment_id=self.experiment.experiment_id, run_name=run_name)

if self.experiment_param:
mlflow.log_params(self.experiment_param)
self._log_params(self.experiment_param)

attrs = {attr: getattr(engine.state, attr, None) for attr in self.default_tracking_params}
self._delete_exist_param_in_dict(attrs)
mlflow.log_params(attrs)
self._log_params(attrs)

def _set_experiment(self):
experiment = self.experiment
if not experiment:
experiment = self.client.get_experiment_by_name(self.experiment_name)
if not experiment:
experiment_id = self.client.create_experiment(self.experiment_name)
experiment = self.client.get_experiment(experiment_id)

if experiment.lifecycle_stage != mlflow.entities.LifecycleStage.ACTIVE:
raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment")
self.experiment = experiment

def _log_params(self, params: Dict[str, Any]) -> None:
if not self.cur_run:
raise ValueError("Current Run is not Active to log params")
params_arr = [mlflow.entities.Param(key, str(value)) for key, value in params.items()]
self.client.log_batch(run_id=self.cur_run.info.run_id, metrics=[], params=params_arr, tags=[])

def _log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
if not self.cur_run:
raise ValueError("Current Run is not Active to log metrics")

run_id = self.cur_run.info.run_id
timestamp = int(time.time() * 1000)
metrics_arr = [mlflow.entities.Metric(key, value, timestamp, step or 0) for key, value in metrics.items()]
self.client.log_batch(run_id=run_id, metrics=metrics_arr, params=[], tags=[])

def _parse_artifacts(self):
"""
Expand All @@ -202,17 +239,20 @@ def complete(self) -> None:
"""
Handler for train or validation/evaluation completed Event.
"""
if self.artifacts:
if self.artifacts and self.cur_run:
artifact_list = self._parse_artifacts()
for artifact in artifact_list:
mlflow.log_artifact(artifact)
self.client.log_artifact(self.cur_run.info.run_id, artifact)

def close(self) -> None:
"""
Stop current running logger of MLFlow.

"""
mlflow.end_run()
if self.cur_run:
status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED)
self.client.set_terminated(self.cur_run.info.run_id, status)
self.cur_run = None

def epoch_completed(self, engine: Engine) -> None:
"""
Expand Down Expand Up @@ -257,11 +297,11 @@ def _default_epoch_log(self, engine: Engine) -> None:
return

current_epoch = self.global_epoch_transform(engine.state.epoch)
mlflow.log_metrics(log_dict, step=current_epoch)
self._log_metrics(log_dict, step=current_epoch)

if self.state_attributes is not None:
attrs = {attr: getattr(engine.state, attr, None) for attr in self.state_attributes}
mlflow.log_metrics(attrs, step=current_epoch)
self._log_metrics(attrs, step=current_epoch)

def _default_iteration_log(self, engine: Engine) -> None:
"""
Expand All @@ -281,7 +321,7 @@ def _default_iteration_log(self, engine: Engine) -> None:
if not isinstance(loss, dict):
loss = {self.tag_name: loss.item() if isinstance(loss, torch.Tensor) else loss}

mlflow.log_metrics(loss, step=engine.state.iteration)
self._log_metrics(loss, step=engine.state.iteration)

# If there is optimizer attr in engine, then record parameters specified in init function.
if hasattr(engine, "optimizer"):
Expand All @@ -291,4 +331,4 @@ def _default_iteration_log(self, engine: Engine) -> None:
f"{param_name} group_{i}": float(param_group[param_name])
for i, param_group in enumerate(cur_optimizer.param_groups)
}
mlflow.log_metrics(params, step=engine.state.iteration)
self._log_metrics(params, step=engine.state.iteration)