monai/apps/nnunet/nnunetv2_runner.py (67 additions & 19 deletions)
@@ -64,6 +64,12 @@ class nnUNetV2Runner: # noqa: N801
- ``"nnUNet_trained_models"``
- ``"dataset_name_or_id"``: Name or Integer ID of the dataset
If an optional key is not specified, then the pipeline will use the default values.
trainer_class_name: the name of the trainer class to use; the trainer classes offered by nnUNetV2
differ in training duration. Default: "nnUNetTrainer". Other options: "nnUNetTrainer_Xepoch",
where X is one of 1, 5, 10, 20, 50, 100, 250, 2000, 4000, and 8000.
export_validation_probabilities: True to save softmax predictions from final validation as npz
files (in addition to predicted segmentations). Needed for finding the best ensemble.
Default: True.
work_dir: working directory to save the intermediate and final results.

Examples:
@@ -141,9 +147,17 @@ class nnUNetV2Runner: # noqa: N801

"""

def __init__(
self,
input_config: Any,
trainer_class_name: str = "nnUNetTrainer",
work_dir: str = "work_dir",
export_validation_probabilities: bool = True,
) -> None:
self.input_info: dict = {}
self.input_config_or_dict = input_config
self.trainer_class_name = trainer_class_name
self.export_validation_probabilities = export_validation_probabilities
self.work_dir = work_dir

if isinstance(self.input_config_or_dict, dict):
Expand Down Expand Up @@ -470,15 +484,16 @@ def plan_and_process(
if not no_pp:
self.preprocess(c, n_proc, overwrite_plans_name, verbose)

def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int = 0, **kwargs: Any) -> None:
"""
Run the training for a single model with one specified configuration, on one or more GPUs.
Note: this will override the environment variable `CUDA_VISIBLE_DEVICES`.

Args:
config: configuration that should be trained. Examples: "2d", "3d_fullres", "3d_lowres".
fold: fold of the 5-fold cross-validation. Should be an int between 0 and 4.
gpu_id: an integer to select the device to use, or a tuple/list of GPU device indices used for multi-GPU
training (e.g., (0,1)). Default: 0.
kwargs: this optional parameter allows you to specify additional arguments in
``nnunetv2.run.run_training.run_training``. Currently supported args are
@@ -489,25 +504,53 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int = 0, **kwargs:
- use_compressed_data: True to use compressed data for training. Reading compressed data is much
more CPU and (potentially) RAM intensive and should only be used if you know what you are
doing. Default: False.
- continue_training: continue training from latest checkpoint. Default: False.
- only_run_validation: True to run the validation only. Requires training to have finished.
Default: False.
- disable_checkpointing: True to disable checkpointing. Ideal for testing things out when you
don't want to flood your hard drive with checkpoints. Default: False.
"""
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
if "num_gpus" in kwargs:
kwargs.pop("num_gpus")
logger.warning("please use gpu_id to set the GPUs to use")

if isinstance(gpu_id, (tuple, list)):
if len(gpu_id) > 1:
# build a comma-separated device list, e.g. (0, 1) -> "0,1"
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(_id) for _id in gpu_id)
else:
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id[0]}"
else:
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"

from nnunetv2.run.run_training import run_training

if isinstance(gpu_id, int) or len(gpu_id) == 1:
run_training(
dataset_name_or_id=self.dataset_name_or_id,
configuration=config,
fold=fold,
trainer_class_name=self.trainer_class_name,
export_validation_probabilities=self.export_validation_probabilities,
**kwargs,
)
else:
run_training(
dataset_name_or_id=self.dataset_name_or_id,
configuration=config,
fold=fold,
num_gpus=len(gpu_id),
trainer_class_name=self.trainer_class_name,
export_validation_probabilities=self.export_validation_probabilities,
**kwargs,
)
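# Example calls (sketch): an int gpu_id selects one device; a tuple/list routes through
# the multi-GPU branch above with num_gpus=len(gpu_id):
#
#   runner.train_single_model(config="3d_fullres", fold=0, gpu_id=0)
#   runner.train_single_model(config="3d_fullres", fold=0, gpu_id=(0, 1))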

def train(
self,
configs: tuple | str = (M.N_3D_FULLRES, M.N_2D, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
device_ids: tuple | list | None = None,
**kwargs: Any,
) -> None:
"""
@@ -522,7 +565,6 @@ def train(
kwargs: this optional parameter allows you to specify additional arguments defined in the
``train_single_model`` method.
"""

if device_ids is None:
result = subprocess.run(["nvidia-smi", "--list-gpus"], stdout=subprocess.PIPE)
output = result.stdout.decode("utf-8")
@@ -534,12 +576,12 @@
else:
for cfg in ensure_tuple(configs):
for _fold in range(self.num_folds):
self.train_single_model(config=cfg, fold=_fold, gpu_id=device_ids, **kwargs)
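# Example (sketch): train every fold of one configuration, each run spanning two GPUs:
#
#   runner.train(configs="3d_fullres", device_ids=(0, 1))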

def train_parallel_cmd(
self,
configs: tuple | str = (M.N_3D_FULLRES, M.N_2D, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
device_ids: tuple | list | None = None,
**kwargs: Any,
) -> list:
"""
@@ -548,7 +590,7 @@ def train_parallel_cmd(
Args:
configs: configurations that should be trained.
Default: ("2d", "3d_fullres", "3d_lowres", "3d_cascade_fullres").
device_ids: a tuple/list of GPU device IDs to use for the training. Default: None (all available GPUs).
kwargs: this optional parameter allows you to specify additional arguments defined in the
``train_single_model`` method.
"""
@@ -586,7 +628,9 @@ def train_parallel_cmd(
cmd = (
"python -m monai.apps.nnunet nnUNetV2Runner train_single_model "
+ f"--input_config '{self.input_config_or_dict}' --work_dir '{self.work_dir}' "
+ f"--config '{_config}' --fold {_i} --gpu_id {the_device}"
+ f"--config '{_config}' --fold {_i} --gpu_id {the_device} "
+ f"--trainer_class_name {self.trainer_class_name} "
+ f"--export_validation_probabilities {self.export_validation_probabilities}"
)
for _key, _value in kwargs.items():
cmd += f" --{_key} {_value}"
@@ -597,7 +641,7 @@ def train_parallel(
def train_parallel(
self,
configs: tuple | str = (M.N_3D_FULLRES, M.N_2D, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
device_ids: tuple | list | None = None,
**kwargs: Any,
) -> None:
"""
@@ -618,7 +662,7 @@
if not gpu_cmd:
continue
logger.info(
f"\ntraining - stage {s + 1}:\n"
f"training - stage {s + 1}:\n"
f"for gpu {gpu_id}, commands: {gpu_cmd}\n"
f"log '.txt' inside '{os.path.join(self.nnunet_results, self.dataset_name)}'"
)
@@ -628,6 +672,7 @@
if not stage[device_id]:
continue
cmd_str = "; ".join(stage[device_id])
logger.info(f"Current running command on GPU device {device_id}:\n{cmd_str}\n")
processes.append(subprocess.Popen(cmd_str, shell=True, stdout=subprocess.DEVNULL))
# finish this stage first
for p in processes:
@@ -665,7 +710,7 @@ def find_best_configuration(
self,
plans: tuple | str = "nnUNetPlans",
configs: tuple | str = (M.N_2D, M.N_3D_FULLRES, M.N_3D_LOWRES, M.N_3D_CASCADE_FULLRES),
trainers: tuple | str = "nnUNetTrainer",
trainers: tuple | str | None = None,
allow_ensembling: bool = True,
num_processes: int = -1,
overwrite: bool = True,
@@ -679,9 +724,9 @@
plans: list of plan identifiers. Default: nnUNetPlans.
configs: list of configurations. Default: ["2d", "3d_fullres", "3d_lowres", "3d_cascade_fullres"].
trainers: list of trainers. Default: None, which falls back to the ``trainer_class_name``
given to the class constructor.
allow_ensembling: set this flag to enable ensembling.
num_processes: number of processes to use for ensembling, postprocessing, etc.
overwrite: if set we will overwrite already ensembled files etc. May speed up consecutive
runs of this command (not recommended) at the risk of not updating outdated results.
folds: folds to use. Default: (0, 1, 2, 3, 4).
strict: a switch that triggers a RuntimeError if the logging folder cannot be found. Default: False.
@@ -693,6 +738,9 @@

configs = ensure_tuple(configs)
plans = ensure_tuple(plans)

if trainers is None:
trainers = self.trainer_class_name
trainers = ensure_tuple(trainers)
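# e.g., a runner constructed with trainer_class_name="nnUNetTrainer_10epoch" and
# trainers=None here evaluates models trained by ("nnUNetTrainer_10epoch",).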

models = dumb_trainer_config_plans_to_trained_models_dict(trainers, configs, plans)