monai/apps/nnunet/nnunetv2_runner.py (67 additions & 19 deletions)
@@ -64,6 +64,12 @@ class nnUNetV2Runner: # noqa: N801
- ``"nnUNet_trained_models"``
- ``"dataset_name_or_id"``: Name or Integer ID of the dataset
If an optional key is not specified, then the pipeline will use the default values.
trainer_class_name: the name of the trainer class to use; the trainer classes offered by nnUNetV2
differ in training duration. Default: "nnUNetTrainer". Other options: "nnUNetTrainer_Xepoch",
where X is one of 1, 5, 10, 20, 50, 100, 250, 2000, 4000, and 8000.
export_validation_probabilities: True to save softmax predictions from final validation as npz
files (in addition to predicted segmentations). Needed for finding the best ensemble.
Default: True.
work_dir: working directory to save the intermediate and final results.

Examples:
@@ -141,9 +147,17 @@ class nnUNetV2Runner: # noqa: N801

"""

def __init__(
self,
input_config: Any,
trainer_class_name: str = "nnUNetTrainer",
work_dir: str = "work_dir",
export_validation_probabilities: bool = True,
) -> None:
self.input_info: dict = {}
self.input_config_or_dict = input_config
self.trainer_class_name = trainer_class_name
self.export_validation_probabilities = export_validation_probabilities
self.work_dir = work_dir

if isinstance(self.input_config_or_dict, dict):
Expand Down Expand Up @@ -470,15 +484,16 @@ def plan_and_process(
if not no_pp:
self.preprocess(c, n_proc, overwrite_plans_name, verbose)

def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int = 0, **kwargs: Any) -> None:
"""
Run the training for a single model with one specified configuration, on one or more GPUs.
Note: this will override the environment variable `CUDA_VISIBLE_DEVICES`.

Args:
config: configuration that should be trained. Examples: "2d", "3d_fullres", "3d_lowres".
fold: fold of the 5-fold cross-validation. Should be an int between 0 and 4.
gpu_id: an integer to select the device to use, or a tuple/list of GPU device indices used for multi-GPU
training (e.g., (0,1)). Default: 0.
kwargs: this optional parameter allows you to specify additional arguments in
``nnunetv2.run.run_training.run_training``. Currently supported args are
@@ -489,25 +504,53 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int = 0, **kwargs:
- use_compressed_data: True to use compressed data for training. Reading compressed data is much
more CPU and (potentially) RAM intensive and should only be used if you know what you are
doing. Default: False.
- continue_training: continue training from latest checkpoint. Default: False.
- only_run_validation: True to run the validation only. Requires training to have finished.
Default: False.
- disable_checkpointing: True to disable checkpointing. Ideal for testing things out when you
don't want to flood your hard drive with checkpoints. Default: False.
"""
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
if "num_gpus" in kwargs:
kwargs.pop("num_gpus")
logger.warning("please use gpu_id to set the GPUs to use")

if isinstance(gpu_id, (tuple, list)):
if len(gpu_id) > 1:
# build a comma-separated device list, e.g. (0, 1) -> "0,1"
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(_id) for _id in gpu_id)
else:
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id[0]}"
else:
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"

from nnunetv2.run.run_training import run_training

if isinstance(gpu_id, int) or len(gpu_id) == 1:
run_training(
dataset_name_or_id=self.dataset_name_or_id,
configuration=config,
fold=fold,
trainer_class_name=self.trainer_class_name,
export_validation_probabilities=self.export_validation_probabilities,
**kwargs,
)
else:
run_training(
dataset_name_or_id=self.dataset_name_or_id,
configuration=config,
fold=fold,
num_gpus=len(gpu_id),
trainer_class_name=self.trainer_class_name,
export_validation_probabilities=self.export_validation_probabilities,
**kwargs,
)
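# Example calls (sketch): an int gpu_id selects one device; a tuple/list routes through
# the multi-GPU branch above with num_gpus=len(gpu_id):
#
#   runner.train_single_model(config="3d_fullres", fold=0, gpu_id=0)
#   runner.train_single_model(config="3d_fullres", fold=0, gpu_id=(0, 1))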

def train(
self,
configs: tuple | str = (M.N_3D_FULLRES, M.N_2D, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
device_ids: tuple | list | None = None,
**kwargs: Any,
) -> None:
"""
@@ -522,7 +565,6 @@ def train(
kwargs: this optional parameter allows you to specify additional arguments defined in the
``train_single_model`` method.
"""

if device_ids is None:
result = subprocess.run(["nvidia-smi", "--list-gpus"], stdout=subprocess.PIPE)
output = result.stdout.decode("utf-8")
@@ -534,12 +576,12 @@
else:
for cfg in ensure_tuple(configs):
for _fold in range(self.num_folds):
self.train_single_model(config=cfg, fold=_fold, gpu_id=device_ids, **kwargs)
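# Example (sketch): train every fold of one configuration, each run spanning two GPUs:
#
#   runner.train(configs="3d_fullres", device_ids=(0, 1))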

def train_parallel_cmd(
self,
configs: tuple | str = (M.N_3D_FULLRES, M.N_2D, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
device_ids: tuple | list | None = None,
**kwargs: Any,
) -> list:
"""
@@ -548,7 +590,7 @@ def train_parallel_cmd(
Args:
configs: configurations that should be trained.
Default: ("2d", "3d_fullres", "3d_lowres", "3d_cascade_fullres").
device_ids: a tuple/list of GPU device IDs to use for the training. Default: None (all available GPUs).
kwargs: this optional parameter allows you to specify additional arguments defined in the
``train_single_model`` method.
"""
@@ -586,7 +628,9 @@ def train_parallel_cmd(
cmd = (
"python -m monai.apps.nnunet nnUNetV2Runner train_single_model "
+ f"--input_config '{self.input_config_or_dict}' --work_dir '{self.work_dir}' "
+ f"--config '{_config}' --fold {_i} --gpu_id {the_device}"
+ f"--config '{_config}' --fold {_i} --gpu_id {the_device} "
+ f"--trainer_class_name {self.trainer_class_name} "
+ f"--export_validation_probabilities {self.export_validation_probabilities}"
)
for _key, _value in kwargs.items():
cmd += f" --{_key} {_value}"
@@ -597,7 +641,7 @@ def train_parallel(
def train_parallel(
self,
configs: tuple | str = (M.N_3D_FULLRES, M.N_2D, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
device_ids: tuple | list | None = None,
**kwargs: Any,
) -> None:
"""
@@ -618,7 +662,7 @@
if not gpu_cmd:
continue
logger.info(
f"\ntraining - stage {s + 1}:\n"
f"training - stage {s + 1}:\n"
f"for gpu {gpu_id}, commands: {gpu_cmd}\n"
f"log '.txt' inside '{os.path.join(self.nnunet_results, self.dataset_name)}'"
)
@@ -628,6 +672,7 @@
if not stage[device_id]:
continue
cmd_str = "; ".join(stage[device_id])
logger.info(f"Current running command on GPU device {device_id}:\n{cmd_str}\n")
processes.append(subprocess.Popen(cmd_str, shell=True, stdout=subprocess.DEVNULL))
# finish this stage first
for p in processes:
@@ -665,7 +710,7 @@ def find_best_configuration(
self,
plans: tuple | str = "nnUNetPlans",
configs: tuple | str = (M.N_2D, M.N_3D_FULLRES, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
trainers: tuple | str = "nnUNetTrainer",
trainers: tuple | str | None = None,
allow_ensembling: bool = True,
num_processes: int = -1,
overwrite: bool = True,
@@ -679,9 +724,9 @@
plans: list of plan identifiers. Default: nnUNetPlans.
configs: list of configurations. Default: ["2d", "3d_fullres", "3d_lowres", "3d_cascade_fullres"].
trainers: list of trainers. Default: None, which falls back to the ``trainer_class_name``
given to the class constructor.
allow_ensembling: set this flag to enable ensembling.
num_processes: number of processes to use for ensembling, postprocessing, etc.
overwrite: if set we will overwrite already ensembled files etc. May speed up consecutive
runs of this command (not recommended) at the risk of not updating outdated results.
folds: folds to use. Default: (0, 1, 2, 3, 4).
strict: a switch that triggers a RuntimeError if the logging folder cannot be found. Default: False.
@@ -693,6 +738,9 @@

configs = ensure_tuple(configs)
plans = ensure_tuple(plans)

if trainers is None:
trainers = self.trainer_class_name
trainers = ensure_tuple(trainers)
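# e.g., a runner constructed with trainer_class_name="nnUNetTrainer_10epoch" and
# trainers=None here evaluates models trained by ("nnUNetTrainer_10epoch",).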

models = dumb_trainer_config_plans_to_trained_models_dict(trainers, configs, plans)