From 3002c675349b421ffb421dd6220e5556280b0198 Mon Sep 17 00:00:00 2001
From: myron
Date: Tue, 22 Nov 2022 18:12:55 -0800
Subject: [PATCH 1/8] cachedataset fix

Signed-off-by: myron
---
 monai/data/dataloader.py | 2 +-
 monai/data/dataset.py    | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/monai/data/dataloader.py b/monai/data/dataloader.py
index 53670046d8..ed9599eab8 100644
--- a/monai/data/dataloader.py
+++ b/monai/data/dataloader.py
@@ -83,7 +83,7 @@ def __init__(self, dataset: Dataset, num_workers: int = 0, **kwargs) -> None:
             # disable unnecessary multiprocessing caching
             from monai.data.dataset import CacheDataset  # avoid circular import

-            if isinstance(dataset, CacheDataset) and dataset.runtime_cache:
+            if isinstance(dataset, CacheDataset):
                 dataset.disable_share_memory_cache()

             _g.manual_seed(init_seed)
diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 7a1b7f5eb9..0ca4b3daf4 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -852,7 +852,8 @@ def disable_share_memory_cache(self):
         Because multiprocessing ProxyList is not supported for the GPU caching, may need to explicitly diasble it.

         """
-        self._cache = list(self._cache)
+        if self.runtime_cache and not dist.is_initialized():
+            self._cache = list(self._cache)

     def _fill_cache(self, indices=None) -> List:
         """

From 33d41ef03065053d309a68502e356ff51245c5b4 Mon Sep 17 00:00:00 2001
From: myron
Date: Wed, 23 Nov 2022 12:23:29 -0800
Subject: [PATCH 2/8] doc string

Signed-off-by: myron
---
 monai/data/dataset.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 0ca4b3daf4..7b10d3864d 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -763,7 +763,7 @@ def __init__(
                 will take the minimum of (cache_num, data_length x cache_rate, data_length).
             num_workers: the number of worker threads if computing cache in the initialization.
                 If num_workers is None then the number returned by os.cpu_count() is used.
-                If a value less than 1 is speficied, 1 will be used instead.
+                If a value less than 1 is specified, 1 will be used instead.
             progress: whether to display a progress bar.
             copy_cache: whether to `deepcopy` the cache content before applying the random transforms,
                 default to `True`. if the random transforms don't modify the cached content
@@ -778,14 +778,12 @@ def __init__(
             hash_func: if `hash_as_key`, a callable to compute hash from data items to be cached.
                 defaults to `monai.data.utils.pickle_hashing`.
             runtime_cache: whether to compute cache at the runtime, default to `False` to prepare
-                the cache content at initializaiton, if `True`, it will cache during the first epoch
+                the cache content at initialization, if `True`, it will cache during the first epoch
                 of model training, so it can start the first mini-batch earlier.
                 please note that:
                 1. when using this option in multi-gpu distributed training,
                    `torch.cuda.set_device()` must be called before initializing this class.
-                2. to execute `runtime cache` on GPU memory, must co-work with
-                   `monai.data.DataLoader`, and can't work with `monai.data.DistributedSampler`
-                   as GPU Tensor usually can't be shared in the multiprocessing context.
-                   (try ``cache_dataset.disable_share_memory_cache()`` in case of GPU caching issues.)
+                2. if caching data that is in GPU memory during multi-gpu distributed training, this option
+                   should not be used, since the underlying shared cache only works for CPU shared memory.

        """
        if not isinstance(transform, Compose):

From 10db196a3d419e0442ff45306b5336d80d97db1a Mon Sep 17 00:00:00 2001
From: myron
Date: Wed, 23 Nov 2022 13:29:03 -0800
Subject: [PATCH 3/8] updating to _is_dist

Signed-off-by: myron
---
 monai/data/dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 7b10d3864d..9f5ee71740 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -825,7 +825,7 @@ def _compute_cache(indices=None):
             cache = Manager().list([None for _ in range(self.cache_num)])
             if self._is_dist:
                 obj_list = [cache]
-                # broadcast the ProxyList to all the ranks, then share the same cache content at runtime
+                # broadcast the ListProxy to all the ranks, then share the same cache content at runtime
                 dist.broadcast_object_list(obj_list, src=0)
                 cache = obj_list[0]
         else:
@@ -846,11 +846,11 @@ def _compute_cache(indices=None):
     def disable_share_memory_cache(self):
         """
-        If the cache content is multiprocessing share memory list, convert it to a regular ptython list.
-        Because multiprocessing ProxyList is not supported for the GPU caching, may need to explicitly diasble it.
+        If the cache content is a multiprocessing shared memory ListProxy, convert it to a regular python list.
+        Because multiprocessing ListProxy is not supported for the GPU caching, explicitly disable it.

         """
-        if self.runtime_cache and not dist.is_initialized():
+        if self.runtime_cache and not self._is_dist:
             self._cache = list(self._cache)

     def _fill_cache(self, indices=None) -> List:
         """

From 76203a2ba15c42e8ce31478f35b602d7526ed9ea Mon Sep 17 00:00:00 2001
From: myron
Date: Fri, 25 Nov 2022 20:07:29 -0800
Subject: [PATCH 4/8] adding back doc string

Signed-off-by: myron
---
 monai/data/dataset.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 9f5ee71740..a8d18f1c97 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -784,6 +784,11 @@ def __init__(
                   `torch.cuda.set_device()` must be called before initializing this class.
                2. if caching data that is in GPU memory during multi-gpu distributed training, this option
                   should not be used, since the underlying shared cache only works for CPU shared memory.
+                3. to execute `runtime cache` on GPU memory, must co-work with
+                   `monai.data.DataLoader`, and can't work with `monai.data.DistributedSampler`
+                   as GPU Tensor usually can't be shared in the multiprocessing context.
+                   (try ``cache_dataset.disable_share_memory_cache()`` in case of GPU caching issues.)
+

        """
        if not isinstance(transform, Compose):

From 4e9a2bd58939cc2e6a3562d11772e362947882cd Mon Sep 17 00:00:00 2001
From: myron
Date: Fri, 25 Nov 2022 20:13:00 -0800
Subject: [PATCH 5/8] update test

Signed-off-by: myron
---
 tests/test_integration_fast_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_integration_fast_train.py b/tests/test_integration_fast_train.py
index bb50ddf7b6..de862d258c 100644
--- a/tests/test_integration_fast_train.py
+++ b/tests/test_integration_fast_train.py
@@ -144,7 +144,7 @@ def test_train_timing(self):
         # set CacheDataset, ThreadDataLoader and DiceCE loss for MONAI fast training
         train_ds = CacheDataset(data=train_files, transform=train_transforms, cache_rate=1.0, num_workers=8)
-        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0, runtime_cache=True)
+        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0, runtime_cache=False)
         # disable multi-workers because `ThreadDataLoader` works with multi-threads
         train_loader = ThreadDataLoader(train_ds, num_workers=0, batch_size=4, shuffle=True)
         val_loader = ThreadDataLoader(val_ds, num_workers=0, batch_size=1)

From d8393b75006cd3b5acdf0f5a54da611009459fce Mon Sep 17 00:00:00 2001
From: myron
Date: Mon, 28 Nov 2022 07:56:30 -0800
Subject: [PATCH 6/8] updated according to comments

Signed-off-by: myron
---
 monai/data/dataset.py                | 7 ++++++-
 tests/test_integration_fast_train.py | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index a8d18f1c97..8dc29d539d 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -855,8 +855,13 @@ def disable_share_memory_cache(self):
         Because multiprocessing ListProxy is not supported for the GPU caching, explicitly disable it.

         """
-        if self.runtime_cache and not self._is_dist:
+        if not self._is_dist:
             self._cache = list(self._cache)
+        elif self.runtime_cache:
+            warnings.warn(
+                "Unable to disable shared cache in DDP, when runtime_cache==True."
+                "Please use runtime_cache=False option to explicitly not use the shared cache."
+            )

     def _fill_cache(self, indices=None) -> List:
         """
diff --git a/tests/test_integration_fast_train.py b/tests/test_integration_fast_train.py
index de862d258c..bb50ddf7b6 100644
--- a/tests/test_integration_fast_train.py
+++ b/tests/test_integration_fast_train.py
@@ -144,7 +144,7 @@ def test_train_timing(self):
         # set CacheDataset, ThreadDataLoader and DiceCE loss for MONAI fast training
         train_ds = CacheDataset(data=train_files, transform=train_transforms, cache_rate=1.0, num_workers=8)
-        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0, runtime_cache=False)
+        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0, runtime_cache=True)
         # disable multi-workers because `ThreadDataLoader` works with multi-threads
         train_loader = ThreadDataLoader(train_ds, num_workers=0, batch_size=4, shuffle=True)
         val_loader = ThreadDataLoader(val_ds, num_workers=0, batch_size=1)

From 4c37438330d22c857a57e56bd2f4ab0658544122 Mon Sep 17 00:00:00 2001
From: myron
Date: Wed, 30 Nov 2022 14:21:22 -0800
Subject: [PATCH 7/8] checking ci

Signed-off-by: myron
---
 monai/data/dataset.py                | 15 ++++++++-------
 tests/test_integration_fast_train.py |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 8dc29d539d..638d4e9d27 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -855,13 +855,14 @@ def disable_share_memory_cache(self):
         Because multiprocessing ListProxy is not supported for the GPU caching, explicitly disable it.

         """
-        if not self._is_dist:
-            self._cache = list(self._cache)
-        elif self.runtime_cache:
-            warnings.warn(
-                "Unable to disable shared cache in DDP, when runtime_cache==True."
-                "Please use runtime_cache=False option to explicitly not use the shared cache."
-            )
+        if self.runtime_cache:
+            if not self._is_dist:
+                self._cache = list(self._cache)
+            else:
+                warnings.warn(
+                    "Unable to disable shared cache in DDP, when runtime_cache==True."
+                    "Please use runtime_cache=False option to explicitly not use the shared cache."
+                )

     def _fill_cache(self, indices=None) -> List:
         """
diff --git a/tests/test_integration_fast_train.py b/tests/test_integration_fast_train.py
index bb50ddf7b6..b544958846 100644
--- a/tests/test_integration_fast_train.py
+++ b/tests/test_integration_fast_train.py
@@ -144,7 +144,7 @@ def test_train_timing(self):
         # set CacheDataset, ThreadDataLoader and DiceCE loss for MONAI fast training
         train_ds = CacheDataset(data=train_files, transform=train_transforms, cache_rate=1.0, num_workers=8)
-        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0, runtime_cache=True)
+        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0)
         # disable multi-workers because `ThreadDataLoader` works with multi-threads
         train_loader = ThreadDataLoader(train_ds, num_workers=0, batch_size=4, shuffle=True)
         val_loader = ThreadDataLoader(val_ds, num_workers=0, batch_size=1)

From 631cd56d7ad0cce69716167bced641ad3f7bc938 Mon Sep 17 00:00:00 2001
From: myron
Date: Wed, 30 Nov 2022 22:53:47 -0800
Subject: [PATCH 8/8] per comments

Signed-off-by: myron
---
 tests/test_integration_fast_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_integration_fast_train.py b/tests/test_integration_fast_train.py
index b544958846..bb50ddf7b6 100644
--- a/tests/test_integration_fast_train.py
+++ b/tests/test_integration_fast_train.py
@@ -144,7 +144,7 @@ def test_train_timing(self):
         # set CacheDataset, ThreadDataLoader and DiceCE loss for MONAI fast training
         train_ds = CacheDataset(data=train_files, transform=train_transforms, cache_rate=1.0, num_workers=8)
-        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0)
+        val_ds = CacheDataset(data=val_files, transform=val_transforms, cache_rate=1.0, runtime_cache=True)
         # disable multi-workers because `ThreadDataLoader` works with multi-threads
         train_loader = ThreadDataLoader(train_ds, num_workers=0, batch_size=4, shuffle=True)
         val_loader = ThreadDataLoader(val_ds, num_workers=0, batch_size=1)
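Usage note: the test change above exercises `runtime_cache` through `ThreadDataLoader` in `tests/test_integration_fast_train.py`. Below is a minimal sketch of that pattern, assuming a MONAI version that already ships the `runtime_cache` option; the dummy in-memory data, the `ScaleIntensityd` transform, and the two-epoch loop are illustrative assumptions, only the `CacheDataset(..., runtime_cache=True)` and `ThreadDataLoader` calls mirror the code these patches touch.

    import numpy as np
    from monai.data import CacheDataset, ThreadDataLoader
    from monai.transforms import Compose, ScaleIntensityd

    # dummy dict-style items; a real pipeline would load images from files
    data = [{"img": np.random.rand(1, 8, 8).astype(np.float32)} for _ in range(4)]
    transform = Compose([ScaleIntensityd(keys="img")])

    # runtime_cache=True defers caching to the first pass over the data
    # instead of filling the cache in __init__
    val_ds = CacheDataset(data=data, transform=transform, cache_rate=1.0, runtime_cache=True)
    # num_workers=0: ThreadDataLoader iterates with threads, so no worker processes are spawned
    val_loader = ThreadDataLoader(val_ds, num_workers=0, batch_size=2)

    for epoch in range(2):
        for batch in val_loader:
            pass  # the first epoch fills the cache; later epochs reuse it

With `runtime_cache=True` the first epoch pays the deterministic-transform cost once and subsequent epochs read the cached items, which is the behavior the integration test is timing.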