From 5ae6732579042253c95e7971fd19393effc336bd Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Thu, 2 Dec 2021 11:36:21 +0800
Subject: [PATCH 01/11] [DLMED] add dataframe

Signed-off-by: Nic Ma
---
 monai/data/dataset.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index f1b481d598..1e5d605b27 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -37,10 +37,12 @@

 if TYPE_CHECKING:
     from tqdm import tqdm
+    from pandas import DataFrame

     has_tqdm = True
 else:
     tqdm, has_tqdm = optional_import("tqdm", "4.47.0", min_version, "tqdm")
+    DataFrame, _ = optional_import("pandas", name="DataFrame")

 lmdb, _ = optional_import("lmdb")
 pd, _ = optional_import("pandas")
@@ -1224,6 +1226,7 @@ class CSVDataset(Dataset):
     Args:
         filename: the filename of expected CSV file to load. if providing a list
             of filenames, it will load all the files and join tables.
+        dataframes: if providing `dataframe` directly, skip loading from filename.
         row_indices: indices of the expected rows to load. it should be a list,
             every item can be a int number or a range `[start, end)` for the indices.
             for example: `row_indices=[[0, 100], 200, 201, 202, 300]`. if None,
@@ -1253,7 +1256,8 @@ class CSVDataset(Dataset):

     def __init__(
         self,
-        filename: Union[str, Sequence[str]],
+        filename: Optional[Union[str, Sequence[str]]] = None,
+        dataframe: Optional[Union[DataFrame, Sequence[DataFrame]]] = None,
         row_indices: Optional[Sequence[Union[int, str]]] = None,
         col_names: Optional[Sequence[str]] = None,
         col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None,
@@ -1262,7 +1266,8 @@ def __init__(
         **kwargs,
     ):
         files = ensure_tuple(filename)
-        dfs = [pd.read_csv(f) for f in files]
+        dataframe = ensure_tuple(dataframe)
+        dfs = [pd.read_csv(f) for f in files] if any([i is None for i in dataframe]) else dataframe
         data = convert_tables_to_dicts(
             dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, **kwargs
        )
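The first patch teaches `CSVDataset` to accept pre-loaded pandas `DataFrame` objects in addition to file paths. A minimal sketch of the new keyword in use (the table contents are hypothetical, and this keyword is later renamed to `src` in patches 6 and 7):

    import pandas as pd

    from monai.data import CSVDataset

    # hypothetical in-memory table; any DataFrame with named columns works
    df = pd.DataFrame({"subject_id": ["s000000", "s000001"], "ehr_0": [2.0, 2.1]})

    # pass the DataFrame directly and skip CSV loading entirely
    dataset = CSVDataset(dataframe=df)
    print(dataset[0])  # one dict per row, e.g. {'subject_id': 's000000', 'ehr_0': 2.0}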
From 43ecee8fa2deaa2c19d4232c144b406e8e86e589 Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Fri, 3 Dec 2021 11:55:28 +0800
Subject: [PATCH 02/11] [DLMED] enhance CSV iterable dataset

Signed-off-by: Nic Ma
---
 monai/data/iterable_dataset.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py
index d1365fa220..c7c623ee55 100644
--- a/monai/data/iterable_dataset.py
+++ b/monai/data/iterable_dataset.py
@@ -149,6 +149,7 @@ class CSVIterableDataset(IterableDataset):
     Args:
         filename: the filename of CSV file to load. it can be a str, URL, path object or file-like object.
             if providing a list of filenames, it will load all the files and join tables.
+        iter: if providing `iter` for stream input directly, skip loading from filename.
         chunksize: rows of a chunk when loading iterable data from CSV files, default to 1000. more details:
             https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
         buffer_size: size of the buffer to store the loaded chunks, if None, set to `2 x chunksize`.
@@ -181,7 +182,8 @@ class CSVIterableDataset(IterableDataset):

     def __init__(
         self,
-        filename: Union[str, Sequence[str]],
+        filename: Optional[Union[str, Sequence[str]]] = None,
+        iter: Optional[Union[Iterable, Sequence[Iterable]]] = None,
         chunksize: int = 1000,
         buffer_size: Optional[int] = None,
         col_names: Optional[Sequence[str]] = None,
@@ -192,7 +194,9 @@ def __init__(
         seed: int = 0,
         **kwargs,
     ):
-        self.files = ensure_tuple(filename)
+        self.filename = filename
+        self.iter = iter
+        self.iters = self.reset(filename=filename, iter=iter)
         self.chunksize = chunksize
         self.buffer_size = 2 * chunksize if buffer_size is None else buffer_size
         self.col_names = col_names
@@ -201,14 +205,18 @@ def __init__(
         self.shuffle = shuffle
         self.seed = seed
         self.kwargs = kwargs
-        self.iters = self.reset()
         super().__init__(data=None, transform=transform)  # type: ignore

-    def reset(self, filename: Optional[Union[str, Sequence[str]]] = None):
-        if filename is not None:
-            # update files if necessary
-            self.files = ensure_tuple(filename)
-        self.iters = [pd.read_csv(f, chunksize=self.chunksize) for f in self.files]
+    def reset(
+        self,
+        filename: Optional[Union[str, Sequence[str]]] = None,
+        iter: Optional[Union[Iterable, Sequence[Iterable]]] = None,
+    ):
+        files = ensure_tuple(self.filename if filename is None else filename)
+        self.iters = ensure_tuple(self.iter if iter is None else iter)
+        # if None in the iters, load from files
+        if any([i is None for i in self.iters]):
+            self.iters = [pd.read_csv(f, chunksize=self.chunksize) for f in files]
         return self.iters

     def _flattened(self):
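Patch 2 mirrors the same idea for the streaming class: `CSVIterableDataset` can now consume an already-open chunk iterator instead of opening files itself. A sketch of the intended usage at this point in the series ("records.csv" is a placeholder path; the `iter` keyword is later renamed to `src`):

    import pandas as pd

    from monai.data import CSVIterableDataset

    # a pandas TextFileReader yields DataFrame chunks lazily
    reader = pd.read_csv("records.csv", chunksize=1000)
    dataset = CSVIterableDataset(iter=reader, chunksize=1000, shuffle=False)
    for item in dataset:
        ...  # each item is one table row, delivered as a dict

    # streams are single-pass, so re-reading requires a fresh reader
    dataset.reset(iter=pd.read_csv("records.csv", chunksize=1000))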
From 87f75fa5a0b0b224beac24a56b833256432bbe7b Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Fri, 3 Dec 2021 16:20:16 +0800
Subject: [PATCH 03/11] [DLMED] add unit tests

Signed-off-by: Nic Ma
---
 monai/data/dataset.py              |  9 +++--
 monai/data/iterable_dataset.py     |  5 ++-
 tests/test_csv_dataset.py          | 55 ++++++++++++++++++++++----
 tests/test_csv_iterable_dataset.py | 62 ++++++++++++++++++++++++++++--
 4 files changed, 115 insertions(+), 16 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 1e5d605b27..748bff3a37 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -36,8 +36,8 @@
 from monai.utils.misc import first

 if TYPE_CHECKING:
-    from tqdm import tqdm
     from pandas import DataFrame
+    from tqdm import tqdm

     has_tqdm = True
 else:
@@ -1266,8 +1266,11 @@ def __init__(
         **kwargs,
     ):
         files = ensure_tuple(filename)
-        dataframe = ensure_tuple(dataframe)
-        dfs = [pd.read_csv(f) for f in files] if any([i is None for i in dataframe]) else dataframe
+        dfs = (dataframe,) if not isinstance(dataframe, (tuple, list)) else dataframe
+        # if None in the dataframes, load from files
+        if any([i is None for i in dfs]):
+            dfs = [pd.read_csv(f) for f in files]
+
         data = convert_tables_to_dicts(
             dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, **kwargs
         )
diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py
index c7c623ee55..8dc0b55aee 100644
--- a/monai/data/iterable_dataset.py
+++ b/monai/data/iterable_dataset.py
@@ -196,7 +196,6 @@ def __init__(
     ):
         self.filename = filename
         self.iter = iter
-        self.iters = self.reset(filename=filename, iter=iter)
         self.chunksize = chunksize
         self.buffer_size = 2 * chunksize if buffer_size is None else buffer_size
         self.col_names = col_names
@@ -205,6 +204,7 @@ def __init__(
         self.shuffle = shuffle
         self.seed = seed
         self.kwargs = kwargs
+        self.iters = self.reset()
         super().__init__(data=None, transform=transform)  # type: ignore

     def reset(
@@ -213,7 +213,8 @@ def reset(
         iter: Optional[Union[Iterable, Sequence[Iterable]]] = None,
     ):
         files = ensure_tuple(self.filename if filename is None else filename)
-        self.iters = ensure_tuple(self.iter if iter is None else iter)
+        iter = self.iter if iter is None else iter
+        self.iters = (iter,) if not isinstance(iter, (tuple, list)) else iter
         # if None in the iters, load from files
         if any([i is None for i in self.iters]):
             self.iters = [pd.read_csv(f, chunksize=self.chunksize) for f in files]
diff --git a/tests/test_csv_dataset.py b/tests/test_csv_dataset.py
index d187f4e64d..7c86f72b73 100644
--- a/tests/test_csv_dataset.py
+++ b/tests/test_csv_dataset.py
@@ -14,6 +14,7 @@
 import unittest

 import numpy as np
+import pandas as pd

 from monai.data import CSVDataset
 from monai.transforms import ToNumpyd
@@ -57,6 +58,7 @@ def prepare_csv_file(data, filepath):
             filepath1 = os.path.join(tempdir, "test_data1.csv")
             filepath2 = os.path.join(tempdir, "test_data2.csv")
             filepath3 = os.path.join(tempdir, "test_data3.csv")
+            filepaths = [filepath1, filepath2, filepath3]
             prepare_csv_file(test_data1, filepath1)
             prepare_csv_file(test_data2, filepath2)
             prepare_csv_file(test_data3, filepath3)
@@ -76,7 +78,7 @@ def prepare_csv_file(data, filepath):
             )

             # test multiple CSV files, join tables with kwargs
-            dataset = CSVDataset([filepath1, filepath2, filepath3], on="subject_id")
+            dataset = CSVDataset(filepaths, on="subject_id")
             self.assertDictEqual(
                 {k: round(v, 4) if not isinstance(v, (str, np.bool_)) else v for k, v in dataset[3].items()},
                 {
@@ -102,7 +104,7 @@ def prepare_csv_file(data, filepath):

             # test selected rows and columns
             dataset = CSVDataset(
-                filename=[filepath1, filepath2, filepath3],
+                filename=filepaths,
                 row_indices=[[0, 2], 3],  # load row: 0, 1, 3
                 col_names=["subject_id", "image", "ehr_1", "ehr_7", "meta_1"],
             )
@@ -120,7 +122,7 @@ def prepare_csv_file(data, filepath):

             # test group columns
             dataset = CSVDataset(
-                filename=[filepath1, filepath2, filepath3],
+                filename=filepaths,
                 row_indices=[1, 3],  # load row: 1, 3
                 col_names=["subject_id", "image", *[f"ehr_{i}" for i in range(11)], "meta_0", "meta_1", "meta_2"],
                 col_groups={"ehr": [f"ehr_{i}" for i in range(11)], "meta12": ["meta_1", "meta_2"]},
@@ -133,9 +135,7 @@ def prepare_csv_file(data, filepath):

             # test transform
             dataset = CSVDataset(
-                filename=[filepath1, filepath2, filepath3],
-                col_groups={"ehr": [f"ehr_{i}" for i in range(5)]},
-                transform=ToNumpyd(keys="ehr"),
+                filename=filepaths, col_groups={"ehr": [f"ehr_{i}" for i in range(5)]}, transform=ToNumpyd(keys="ehr")
             )
             self.assertEqual(len(dataset), 5)
             expected = [
@@ -151,7 +151,7 @@ def prepare_csv_file(data, filepath):

             # test default values and dtype
             dataset = CSVDataset(
-                filename=[filepath1, filepath2, filepath3],
+                filename=filepaths,
                 col_names=["subject_id", "image", "ehr_1", "ehr_9", "meta_1"],
                 col_types={"image": {"type": str, "default": "No image"}, "ehr_1": {"type": int, "default": 0}},
                 how="outer",  # generate NaN values in this merge mode
@@ -161,6 +161,47 @@ def prepare_csv_file(data, filepath):
             self.assertEqual(type(dataset[-1]["ehr_1"]), int)
             np.testing.assert_allclose(dataset[-1]["ehr_9"], 3.3537, rtol=1e-2)

+            # test pre-loaded DataFrame
+            df = pd.read_csv(filepath1)
+            dataset = CSVDataset(dataframe=df)
+            self.assertDictEqual(
+                {k: round(v, 4) if not isinstance(v, str) else v for k, v in dataset[2].items()},
+                {
+                    "subject_id": "s000002",
+                    "label": 4,
+                    "image": "./imgs/s000002.png",
+                    "ehr_0": 3.7725,
+                    "ehr_1": 4.2118,
+                    "ehr_2": 4.6353,
+                },
+            )
+
+            # test pre-loaded multiple DataFrames, join tables with kwargs
+            dfs = [pd.read_csv(i) for i in filepaths]
+            dataset = CSVDataset(dataframe=dfs, on="subject_id")
+            self.assertDictEqual(
+                {k: round(v, 4) if not isinstance(v, (str, np.bool_)) else v for k, v in dataset[3].items()},
+                {
+                    "subject_id": "s000003",
+                    "label": 1,
+                    "image": "./imgs/s000003.png",
+                    "ehr_0": 3.3333,
+                    "ehr_1": 3.2353,
+                    "ehr_2": 3.4000,
+                    "ehr_3": 3.1647,
+                    "ehr_4": 3.0863,
+                    "ehr_5": 3.7255,
+                    "ehr_6": 3.6980,
+                    "ehr_7": 3.6980,
+                    "ehr_8": 3.7020,
+                    "ehr_9": 3.3098,
+                    "ehr_10": 3.7294,
+                    "meta_0": False,
+                    "meta_1": False,
+                    "meta_2": True,
+                },
+            )


 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_csv_iterable_dataset.py b/tests/test_csv_iterable_dataset.py
index fae8e0ba8d..ea3c1e301b 100644
--- a/tests/test_csv_iterable_dataset.py
+++ b/tests/test_csv_iterable_dataset.py
@@ -15,6 +15,7 @@
 import unittest

 import numpy as np
+import pandas as pd

 from monai.data import CSVIterableDataset, DataLoader
 from monai.transforms import ToNumpyd
@@ -58,6 +59,7 @@ def prepare_csv_file(data, filepath):
             filepath1 = os.path.join(tempdir, "test_data1.csv")
             filepath2 = os.path.join(tempdir, "test_data2.csv")
             filepath3 = os.path.join(tempdir, "test_data3.csv")
+            filepaths = [filepath1, filepath2, filepath3]
             prepare_csv_file(test_data1, filepath1)
             prepare_csv_file(test_data2, filepath2)
             prepare_csv_file(test_data3, filepath3)
@@ -92,7 +94,7 @@ def prepare_csv_file(data, filepath):
             self.assertEqual(count, 5)

             # test multiple CSV files, join tables with kwargs
-            dataset = CSVIterableDataset([filepath1, filepath2, filepath3], on="subject_id", shuffle=False)
+            dataset = CSVIterableDataset(filepaths, on="subject_id", shuffle=False)
             count = 0
             for item in dataset:
                 count += 1
@@ -123,7 +125,7 @@ def prepare_csv_file(data, filepath):

             # test selected columns and chunk size
             dataset = CSVIterableDataset(
-                filename=[filepath1, filepath2, filepath3],
+                filename=filepaths,
                 chunksize=2,
                 col_names=["subject_id", "image", "ehr_1", "ehr_7", "meta_1"],
                 shuffle=False,
@@ -146,7 +148,7 @@ def prepare_csv_file(data, filepath):

             # test group columns
             dataset = CSVIterableDataset(
-                filename=[filepath1, filepath2, filepath3],
+                filename=filepaths,
                 col_names=["subject_id", "image", *[f"ehr_{i}" for i in range(11)], "meta_0", "meta_1", "meta_2"],
                 col_groups={"ehr": [f"ehr_{i}" for i in range(11)], "meta12": ["meta_1", "meta_2"]},
                 shuffle=False,
@@ -166,7 +168,7 @@ def prepare_csv_file(data, filepath):
             dataset = CSVIterableDataset(
                 chunksize=2,
                 buffer_size=4,
-                filename=[filepath1, filepath2, filepath3],
+                filename=filepaths,
                 col_groups={"ehr": [f"ehr_{i}" for i in range(5)]},
                 transform=ToNumpyd(keys="ehr"),
                 shuffle=True,
@@ -201,6 +203,58 @@ def prepare_csv_file(data, filepath):
             np.testing.assert_allclose(item["label"], [4])
             self.assertListEqual(item["image"], ["./imgs/s000002.png"])
             self.assertEqual(count, 3)
+
+            # test iterable stream
+            iters = pd.read_csv(filepath1, chunksize=1000)
+            dataset = CSVIterableDataset(iter=iters, shuffle=False)
+            count = 0
+            for item in dataset:
+                count += 1
+                if count == 3:
+                    self.assertDictEqual(
+                        {k: round(v, 4) if not isinstance(v, str) else v for k, v in item.items()},
+                        {
+                            "subject_id": "s000002",
+                            "label": 4,
+                            "image": "./imgs/s000002.png",
+                            "ehr_0": 3.7725,
+                            "ehr_1": 4.2118,
+                            "ehr_2": 4.6353,
+                        },
+                    )
+                    break
+            self.assertEqual(count, 3)
+
+            # test multiple iterable streams, join tables with kwargs
+            iters = [pd.read_csv(i, chunksize=1000) for i in filepaths]
+            dataset = CSVIterableDataset(filepaths, on="subject_id", shuffle=False)
+            count = 0
+            for item in dataset:
+                count += 1
+                if count == 4:
+                    self.assertDictEqual(
+                        {k: round(v, 4) if not isinstance(v, (str, np.bool_)) else v for k, v in item.items()},
+                        {
+                            "subject_id": "s000003",
+                            "label": 1,
+                            "image": "./imgs/s000003.png",
+                            "ehr_0": 3.3333,
+                            "ehr_1": 3.2353,
+                            "ehr_2": 3.4000,
+                            "ehr_3": 3.1647,
+                            "ehr_4": 3.0863,
+                            "ehr_5": 3.7255,
+                            "ehr_6": 3.6980,
+                            "ehr_7": 3.6980,
+                            "ehr_8": 3.7020,
+                            "ehr_9": 3.3098,
+                            "ehr_10": 3.7294,
+                            "meta_0": False,
+                            "meta_1": False,
+                            "meta_2": True,
+                        },
+                    )
+            self.assertEqual(count, 5)


 if __name__ == "__main__":
     unittest.main()
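Patch 3's switch from `ensure_tuple(dataframe)` to an explicit `isinstance(..., (tuple, list))` check is subtle but likely the key fix here: a `DataFrame` is itself iterable (over its column labels), so generic tuple conversion would silently decompose a single table into column names. A sketch of the pitfall, assuming standard pandas behaviour:

    import pandas as pd

    df = pd.DataFrame({"subject_id": ["s000000"], "ehr_0": [2.0]})

    # iterating a DataFrame yields column labels, not rows
    print(tuple(df))  # ('subject_id', 'ehr_0')

    # the explicit check keeps the table intact
    dfs = (df,) if not isinstance(df, (tuple, list)) else df
    print(len(dfs), type(dfs[0]))  # 1 <class 'pandas.core.frame.DataFrame'>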
From 7163d212813ba0e45556b4773b16faca8fdd5a19 Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Fri, 3 Dec 2021 16:45:48 +0800
Subject: [PATCH 04/11] [DLMED] fix typehints

Signed-off-by: Nic Ma
---
 monai/data/dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 748bff3a37..bc0078fcd6 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -36,13 +36,11 @@
 from monai.utils.misc import first

 if TYPE_CHECKING:
-    from pandas import DataFrame
     from tqdm import tqdm

     has_tqdm = True
 else:
     tqdm, has_tqdm = optional_import("tqdm", "4.47.0", min_version, "tqdm")
-    DataFrame, _ = optional_import("pandas", name="DataFrame")

 lmdb, _ = optional_import("lmdb")
 pd, _ = optional_import("pandas")
@@ -1257,7 +1255,7 @@ class CSVDataset(Dataset):
     def __init__(
         self,
         filename: Optional[Union[str, Sequence[str]]] = None,
-        dataframe: Optional[Union[DataFrame, Sequence[DataFrame]]] = None,
+        dataframe=None,
         row_indices: Optional[Sequence[Union[int, str]]] = None,
         col_names: Optional[Sequence[str]] = None,
         col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None,

From 0a3679802d15e5928989363577ea5c4dfc088eda Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Fri, 3 Dec 2021 17:03:24 +0800
Subject: [PATCH 05/11] [DLMED] add comment

Signed-off-by: Nic Ma
---
 monai/data/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index bc0078fcd6..94e789eb63 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -1224,7 +1224,7 @@ class CSVDataset(Dataset):
     Args:
         filename: the filename of expected CSV file to load. if providing a list
             of filenames, it will load all the files and join tables.
-        dataframes: if providing `dataframe` directly, skip loading from filename.
+        dataframe: if providing `dataframe` directly, skip loading from filename.
         row_indices: indices of the expected rows to load. it should be a list,
             every item can be a int number or a range `[start, end)` for the indices.
             for example: `row_indices=[[0, 100], 200, 201, 202, 300]`. if None,
@@ -1255,7 +1255,7 @@ class CSVDataset(Dataset):
     def __init__(
         self,
         filename: Optional[Union[str, Sequence[str]]] = None,
-        dataframe=None,
+        dataframe=None,  # if not None, should be `pandas.DataFrame` or sequence of `pandas.DataFrame`
         row_indices: Optional[Sequence[Union[int, str]]] = None,
         col_names: Optional[Sequence[str]] = None,
         col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None,
From 9f4885d6384d72c746b4f0abc6dc7187649c0a14 Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Sat, 4 Dec 2021 09:58:22 +0800
Subject: [PATCH 06/11] [DLMED] update according to comments

Signed-off-by: Nic Ma
---
 monai/data/iterable_dataset.py     | 40 ++++++++++++++++----------------
 tests/test_csv_iterable_dataset.py | 10 +++++-----
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py
index 8dc0b55aee..fa505b7b24 100644
--- a/monai/data/iterable_dataset.py
+++ b/monai/data/iterable_dataset.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any, Callable, Dict, Iterable, Optional, Sequence, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union

 import numpy as np
 from torch.utils.data import IterableDataset as _TorchIterableDataset
@@ -18,7 +18,7 @@
 from monai.data.utils import convert_tables_to_dicts
 from monai.transforms import apply_transform
 from monai.transforms.transform import Randomizable
-from monai.utils import ensure_tuple, optional_import
+from monai.utils import deprecated_arg, ensure_tuple, optional_import

 pd, _ = optional_import("pandas")

@@ -147,9 +147,9 @@ class CSVIterableDataset(IterableDataset):
     ]

     Args:
-        filename: the filename of CSV file to load. it can be a str, URL, path object or file-like object.
-            if providing a list of filenames, it will load all the files and join tables.
-        iter: if providing `iter` for stream input directly, skip loading from filename.
+        src: if provided the filename of CSV file to load. it can be a str, URL, path object or file-like object.
+            also support to provide iter for stream input directly, will skip loading from filename.
+            if provided a list of filenames or iters, it will join the tables.
         chunksize: rows of a chunk when loading iterable data from CSV files, default to 1000. more details:
             https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
         buffer_size: size of the buffer to store the loaded chunks, if None, set to `2 x chunksize`.
@@ -180,10 +180,10 @@ class CSVIterableDataset(IterableDataset):

     """

+    @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.")
     def __init__(
         self,
-        filename: Optional[Union[str, Sequence[str]]] = None,
-        iter: Optional[Union[Iterable, Sequence[Iterable]]] = None,
+        src: Union[Union[str, Sequence[str]], Union[Iterable, Sequence[Iterable]]],
         chunksize: int = 1000,
         buffer_size: Optional[int] = None,
         col_names: Optional[Sequence[str]] = None,
@@ -194,8 +194,7 @@ def __init__(
         seed: int = 0,
         **kwargs,
     ):
-        self.filename = filename
-        self.iter = iter
+        self.src = src
         self.chunksize = chunksize
         self.buffer_size = 2 * chunksize if buffer_size is None else buffer_size
         self.col_names = col_names
@@ -207,17 +206,18 @@ def __init__(
         self.iters = self.reset()
         super().__init__(data=None, transform=transform)  # type: ignore

-    def reset(
-        self,
-        filename: Optional[Union[str, Sequence[str]]] = None,
-        iter: Optional[Union[Iterable, Sequence[Iterable]]] = None,
-    ):
-        files = ensure_tuple(self.filename if filename is None else filename)
-        iter = self.iter if iter is None else iter
-        self.iters = (iter,) if not isinstance(iter, (tuple, list)) else iter
-        # if None in the iters, load from files
-        if any([i is None for i in self.iters]):
-            self.iters = [pd.read_csv(f, chunksize=self.chunksize) for f in files]
+    @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.")
+    def reset(self, src: Optional[Union[Union[str, Sequence[str]], Union[Iterable, Sequence[Iterable]]]] = None):
+        src = self.src if src is None else src
+        srcs = (src,) if not isinstance(src, (tuple, list)) else src
+        self.iters: List[Iterable] = []
+        for i in srcs:
+            if isinstance(i, str):
+                self.iters.append(pd.read_csv(i, chunksize=self.chunksize))
+            elif isinstance(i, Iterable):
+                self.iters.append(i)
+            else:
+                raise ValueError("`src` must be file path or iterable object.")
         return self.iters

     def _flattened(self):
diff --git a/tests/test_csv_iterable_dataset.py b/tests/test_csv_iterable_dataset.py
index ea3c1e301b..769dd72a20 100644
--- a/tests/test_csv_iterable_dataset.py
+++ b/tests/test_csv_iterable_dataset.py
@@ -85,7 +85,7 @@ def prepare_csv_file(data, filepath):
             self.assertEqual(count, 3)

             # test reset iterables
-            dataset.reset(filename=filepath3)
+            dataset.reset(src=filepath3)
             count = 0
             for i, item in enumerate(dataset):
                 count += 1
@@ -125,7 +125,7 @@ def prepare_csv_file(data, filepath):

             # test selected columns and chunk size
             dataset = CSVIterableDataset(
-                filename=filepaths,
+                src=filepaths,
                 chunksize=2,
                 col_names=["subject_id", "image", "ehr_1", "ehr_7", "meta_1"],
                 shuffle=False,
@@ -148,7 +148,7 @@ def prepare_csv_file(data, filepath):

             # test group columns
             dataset = CSVIterableDataset(
-                filename=filepaths,
+                src=filepaths,
                 col_names=["subject_id", "image", *[f"ehr_{i}" for i in range(11)], "meta_0", "meta_1", "meta_2"],
                 col_groups={"ehr": [f"ehr_{i}" for i in range(11)], "meta12": ["meta_1", "meta_2"]},
                 shuffle=False,
@@ -168,7 +168,7 @@ def prepare_csv_file(data, filepath):
             dataset = CSVIterableDataset(
                 chunksize=2,
                 buffer_size=4,
-                filename=filepaths,
+                src=filepaths,
                 col_groups={"ehr": [f"ehr_{i}" for i in range(5)]},
                 transform=ToNumpyd(keys="ehr"),
                 shuffle=True,
@@ -205,7 +205,7 @@ def prepare_csv_file(data, filepath):

             # test iterable stream
             iters = pd.read_csv(filepath1, chunksize=1000)
-            dataset = CSVIterableDataset(iter=iters, shuffle=False)
+            dataset = CSVIterableDataset(src=iters, shuffle=False)
             count = 0
             for item in dataset:
                 count += 1
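Patch 6 collapses `filename` and `iter` into a single `src` argument, keeping the old keyword alive through MONAI's `deprecated_arg` decorator shown in the diff above, which remaps `filename=` to `src=` and emits a deprecation warning. A sketch of what callers see ("records.csv" is a placeholder path):

    from monai.data import CSVIterableDataset

    # new spelling: a path, a list of paths, or an iterable of DataFrame chunks
    dataset = CSVIterableDataset(src="records.csv")

    # old spelling still works during the deprecation window:
    # the decorator rewrites `filename=...` into `src=...` and warns
    dataset = CSVIterableDataset(filename="records.csv")  # DeprecationWarning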
+ """ + @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.") def __init__( self, - filename: Optional[Union[str, Sequence[str]]] = None, - dataframe=None, # if not None, should be `pandas.DataFrame` or sequense of `pandas.DataFrame` + # `src` also can be `pandas.DataFrame` or sequense of `pandas.DataFrame` + src: Optional[Union[str, Sequence[str]]] = None, row_indices: Optional[Sequence[Union[int, str]]] = None, col_names: Optional[Sequence[str]] = None, col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None, @@ -1263,11 +1267,15 @@ def __init__( transform: Optional[Callable] = None, **kwargs, ): - files = ensure_tuple(filename) - dfs = (dataframe,) if not isinstance(dataframe, (tuple, list)) else dataframe - # if None in the dataframes, load from files - if any([i is None for i in dfs]): - dfs = [pd.read_csv(f) for f in files] + srcs = (src,) if not isinstance(src, (tuple, list)) else src + dfs: List = [] + for i in srcs: + if isinstance(i, str): + dfs.append(pd.read_csv(i)) + elif isinstance(i, pd.DataFrame): + dfs.append(i) + else: + raise ValueError("`src` must be file path or pandas `DataFrame`.") data = convert_tables_to_dicts( dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, **kwargs diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py index fa505b7b24..185469d76d 100644 --- a/monai/data/iterable_dataset.py +++ b/monai/data/iterable_dataset.py @@ -18,7 +18,7 @@ from monai.data.utils import convert_tables_to_dicts from monai.transforms import apply_transform from monai.transforms.transform import Randomizable -from monai.utils import deprecated_arg, ensure_tuple, optional_import +from monai.utils import deprecated_arg, optional_import pd, _ = optional_import("pandas") @@ -147,7 +147,7 @@ class CSVIterableDataset(IterableDataset): ] Args: - src: if provided the filename of CSV file to load. it can be a str, URL, path object or file-like object. + src: if provided the filename of CSV file, it can be a str, URL, path object or file-like object to load. also support to provide iter for stream input directly, will skip loading from filename. if provided a list of filenames or iters, it will join the tables. chunksize: rows of a chunk when loading iterable data from CSV files, default to 1000. more details: @@ -178,6 +178,9 @@ class CSVIterableDataset(IterableDataset): https://github.com/pytorch/pytorch/blob/v1.10.0/torch/utils/data/distributed.py#L98. kwargs: additional arguments for `pandas.merge()` API to join tables. + .. deprecated:: 0.8.0 + ``filename`` is deprecated, use ``src`` instead. 
+ """ @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.") @@ -203,14 +206,14 @@ def __init__( self.shuffle = shuffle self.seed = seed self.kwargs = kwargs - self.iters = self.reset() + self.iters: List[Iterable] = self.reset() super().__init__(data=None, transform=transform) # type: ignore @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.") def reset(self, src: Optional[Union[Union[str, Sequence[str]], Union[Iterable, Sequence[Iterable]]]] = None): src = self.src if src is None else src srcs = (src,) if not isinstance(src, (tuple, list)) else src - self.iters: List[Iterable] = [] + self.iters = [] for i in srcs: if isinstance(i, str): self.iters.append(pd.read_csv(i, chunksize=self.chunksize)) diff --git a/tests/test_csv_dataset.py b/tests/test_csv_dataset.py index 7c86f72b73..7bc9f59f54 100644 --- a/tests/test_csv_dataset.py +++ b/tests/test_csv_dataset.py @@ -104,7 +104,7 @@ def prepare_csv_file(data, filepath): # test selected rows and columns dataset = CSVDataset( - filename=filepaths, + src=filepaths, row_indices=[[0, 2], 3], # load row: 0, 1, 3 col_names=["subject_id", "image", "ehr_1", "ehr_7", "meta_1"], ) @@ -122,7 +122,7 @@ def prepare_csv_file(data, filepath): # test group columns dataset = CSVDataset( - filename=filepaths, + src=filepaths, row_indices=[1, 3], # load row: 1, 3 col_names=["subject_id", "image", *[f"ehr_{i}" for i in range(11)], "meta_0", "meta_1", "meta_2"], col_groups={"ehr": [f"ehr_{i}" for i in range(11)], "meta12": ["meta_1", "meta_2"]}, @@ -135,7 +135,7 @@ def prepare_csv_file(data, filepath): # test transform dataset = CSVDataset( - filename=filepaths, col_groups={"ehr": [f"ehr_{i}" for i in range(5)]}, transform=ToNumpyd(keys="ehr") + src=filepaths, col_groups={"ehr": [f"ehr_{i}" for i in range(5)]}, transform=ToNumpyd(keys="ehr") ) self.assertEqual(len(dataset), 5) expected = [ @@ -151,7 +151,7 @@ def prepare_csv_file(data, filepath): # test default values and dtype dataset = CSVDataset( - filename=filepaths, + src=filepaths, col_names=["subject_id", "image", "ehr_1", "ehr_9", "meta_1"], col_types={"image": {"type": str, "default": "No image"}, "ehr_1": {"type": int, "default": 0}}, how="outer", # generate NaN values in this merge mode @@ -163,7 +163,7 @@ def prepare_csv_file(data, filepath): # test pre-loaded DataFrame df = pd.read_csv(filepath1) - dataset = CSVDataset(dataframe=df) + dataset = CSVDataset(src=df) self.assertDictEqual( {k: round(v, 4) if not isinstance(v, str) else v for k, v in dataset[2].items()}, { @@ -178,7 +178,7 @@ def prepare_csv_file(data, filepath): # test pre-loaded multiple DataFrames, join tables with kwargs dfs = [pd.read_csv(i) for i in filepaths] - dataset = CSVDataset(dataframe=dfs, on="subject_id") + dataset = CSVDataset(src=dfs, on="subject_id") self.assertDictEqual( {k: round(v, 4) if not isinstance(v, (str, np.bool_)) else v for k, v in dataset[3].items()}, { diff --git a/tests/test_csv_iterable_dataset.py b/tests/test_csv_iterable_dataset.py index 769dd72a20..9563bc9cf8 100644 --- a/tests/test_csv_iterable_dataset.py +++ b/tests/test_csv_iterable_dataset.py @@ -125,10 +125,7 @@ def prepare_csv_file(data, filepath): # test selected columns and chunk size dataset = CSVIterableDataset( - src=filepaths, - chunksize=2, - col_names=["subject_id", "image", "ehr_1", "ehr_7", "meta_1"], - shuffle=False, + src=filepaths, chunksize=2, col_names=["subject_id", "image", "ehr_1", "ehr_7", "meta_1"], shuffle=False ) count = 0 for item 
From 532b1d701d5f5b1e68c5585524c3d5d77131e507 Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Mon, 6 Dec 2021 19:45:55 +0800
Subject: [PATCH 08/11] [DLMED] update according to comments

Signed-off-by: Nic Ma
---
 monai/data/dataset.py              |  3 +--
 tests/test_csv_dataset.py          | 26 ++++----------------------
 tests/test_csv_iterable_dataset.py | 26 ++++----------------------
 3 files changed, 9 insertions(+), 46 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index e84155b926..f0416a6bdb 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -1258,8 +1258,7 @@ class CSVDataset(Dataset):
     @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.")
     def __init__(
         self,
-        # `src` also can be `pandas.DataFrame` or sequence of `pandas.DataFrame`
-        src: Optional[Union[str, Sequence[str]]] = None,
+        src: Optional[Union[str, Sequence[str]]] = None,  # also can be `DataFrame` or sequence of `DataFrame`
         row_indices: Optional[Sequence[Union[int, str]]] = None,
         col_names: Optional[Sequence[str]] = None,
         col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None,
diff --git a/tests/test_csv_dataset.py b/tests/test_csv_dataset.py
index 7bc9f59f54..2a38409ba0 100644
--- a/tests/test_csv_dataset.py
+++ b/tests/test_csv_dataset.py
@@ -179,28 +179,10 @@ def prepare_csv_file(data, filepath):
             # test pre-loaded multiple DataFrames, join tables with kwargs
             dfs = [pd.read_csv(i) for i in filepaths]
             dataset = CSVDataset(src=dfs, on="subject_id")
-            self.assertDictEqual(
-                {k: round(v, 4) if not isinstance(v, (str, np.bool_)) else v for k, v in dataset[3].items()},
-                {
-                    "subject_id": "s000003",
-                    "label": 1,
-                    "image": "./imgs/s000003.png",
-                    "ehr_0": 3.3333,
-                    "ehr_1": 3.2353,
-                    "ehr_2": 3.4000,
-                    "ehr_3": 3.1647,
-                    "ehr_4": 3.0863,
-                    "ehr_5": 3.7255,
-                    "ehr_6": 3.6980,
-                    "ehr_7": 3.6980,
-                    "ehr_8": 3.7020,
-                    "ehr_9": 3.3098,
-                    "ehr_10": 3.7294,
-                    "meta_0": False,
-                    "meta_1": False,
-                    "meta_2": True,
-                },
-            )
+            self.assertEqual(dataset[3]["subject_id"], "s000003")
+            self.assertEqual(dataset[3]["label"], 1)
+            self.assertEqual(round(dataset[3]["ehr_0"], 4), 3.3333)
+            self.assertEqual(dataset[3]["meta_0"], False)


 if __name__ == "__main__":
diff --git a/tests/test_csv_iterable_dataset.py b/tests/test_csv_iterable_dataset.py
index 9563bc9cf8..7393109c82 100644
--- a/tests/test_csv_iterable_dataset.py
+++ b/tests/test_csv_iterable_dataset.py
@@ -228,28 +228,10 @@ def prepare_csv_file(data, filepath):
             for item in dataset:
                 count += 1
                 if count == 4:
-                    self.assertDictEqual(
-                        {k: round(v, 4) if not isinstance(v, (str, np.bool_)) else v for k, v in item.items()},
-                        {
-                            "subject_id": "s000003",
-                            "label": 1,
-                            "image": "./imgs/s000003.png",
-                            "ehr_0": 3.3333,
-                            "ehr_1": 3.2353,
-                            "ehr_2": 3.4000,
-                            "ehr_3": 3.1647,
-                            "ehr_4": 3.0863,
-                            "ehr_5": 3.7255,
-                            "ehr_6": 3.6980,
-                            "ehr_7": 3.6980,
-                            "ehr_8": 3.7020,
-                            "ehr_9": 3.3098,
-                            "ehr_10": 3.7294,
-                            "meta_0": False,
-                            "meta_1": False,
-                            "meta_2": True,
-                        },
-                    )
+                    self.assertEqual(item["subject_id"], "s000003")
+                    self.assertEqual(item["label"], 1)
+                    self.assertEqual(round(item["ehr_0"], 4), 3.3333)
+                    self.assertEqual(item["meta_0"], False)
             self.assertEqual(count, 5)

From 6cf2b67b45c1907c69abec7fad850bda8a67c7c3 Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Mon, 6 Dec 2021 20:27:52 +0800
Subject: [PATCH 09/11] [DLMED] fix file close issue

Signed-off-by: Nic Ma
---
 monai/data/iterable_dataset.py     | 23 +++++++++++++++++++++++
 tests/test_csv_iterable_dataset.py | 13 ++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py
index 185469d76d..6c7c4c7474 100644
--- a/monai/data/iterable_dataset.py
+++ b/monai/data/iterable_dataset.py
@@ -211,6 +211,17 @@ def __init__(

     @deprecated_arg(name="filename", new_name="src", since="0.8", msg_suffix="please use `src` instead.")
     def reset(self, src: Optional[Union[Union[str, Sequence[str]], Union[Iterable, Sequence[Iterable]]]] = None):
+        """
+        Reset the pandas `TextFileReader` iterable object to read data. For more details, please check:
+        https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html?#iteration.
+
+        Args:
+            src: if not None and provided the filename of CSV file, it can be a str, URL, path object
+              or file-like object to load. also support to provide iter for stream input directly,
+              will skip loading from filename. if provided a list of filenames or iters, it will join the tables.
+              default to `self.src`.
+
+        """
         src = self.src if src is None else src
         srcs = (src,) if not isinstance(src, (tuple, list)) else src
         self.iters = []
@@ -223,6 +234,18 @@ def reset(self, src: Optional[Union[Union[str, Sequence[str]], Union[Iterable, Sequence[Iterable]]]] = None):
                 raise ValueError("`src` must be file path or iterable object.")
         return self.iters

+    def close(self):
+        """
+        Close the pandas `TextFileReader` iterable objects.
+        If the input src is file path, TextFileReader was created internally, need to close it.
+        If the input src is iterable object, depends on users requirements whether to close it in this function.
+        For more details, please check: 
+        https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html?#iteration.
+
+        """
+        for i in self.iters:
+            i.close()
+
     def _flattened(self):
         for chunks in zip(*self.iters):
             yield from convert_tables_to_dicts(
diff --git a/tests/test_csv_iterable_dataset.py b/tests/test_csv_iterable_dataset.py
index 7393109c82..2d2ea2a9c0 100644
--- a/tests/test_csv_iterable_dataset.py
+++ b/tests/test_csv_iterable_dataset.py
@@ -83,6 +83,7 @@ def prepare_csv_file(data, filepath):
                 )
                 break
             self.assertEqual(count, 3)
+            dataset.close()

             # test reset iterables
             dataset.reset(src=filepath3)
@@ -92,6 +93,7 @@ def prepare_csv_file(data, filepath):
                 if i == 4:
                     self.assertEqual(item["meta_0"], False)
             self.assertEqual(count, 5)
+            dataset.close()

             # test multiple CSV files, join tables with kwargs
             dataset = CSVIterableDataset(filepaths, on="subject_id", shuffle=False)
@@ -122,6 +124,7 @@ def prepare_csv_file(data, filepath):
                     },
                 )
             self.assertEqual(count, 5)
+            dataset.close()

             # test selected columns and chunk size
             dataset = CSVIterableDataset(
@@ -142,6 +145,7 @@ def prepare_csv_file(data, filepath):
                     },
                 )
             self.assertEqual(count, 5)
+            dataset.close()

             # test group columns
             dataset = CSVIterableDataset(
@@ -160,6 +164,7 @@ def prepare_csv_file(data, filepath):
                 )
                 np.testing.assert_allclose(item["meta12"], [False, True])
             self.assertEqual(count, 5)
+            dataset.close()

             # test transform
             dataset = CSVIterableDataset(
@@ -184,6 +189,7 @@ def prepare_csv_file(data, filepath):
                 self.assertTrue(isinstance(item["ehr"], np.ndarray))
                 np.testing.assert_allclose(np.around(item["ehr"], 4), exp)
             self.assertEqual(count, 5)
+            dataset.close()

             # test multiple processes loading
             dataset = CSVIterableDataset(filepath1, transform=ToNumpyd(keys="label"), shuffle=False)
@@ -199,6 +205,7 @@ def prepare_csv_file(data, filepath):
             np.testing.assert_allclose(item["label"], [4])
             self.assertListEqual(item["image"], ["./imgs/s000002.png"])
             self.assertEqual(count, 3)
+            dataset.close()

             # test iterable stream
             iters = pd.read_csv(filepath1, chunksize=1000)
@@ -220,10 +227,11 @@ def prepare_csv_file(data, filepath):
                     )
                     break
             self.assertEqual(count, 3)
+            dataset.close()

             # test multiple iterable streams, join tables with kwargs
             iters = [pd.read_csv(i, chunksize=1000) for i in filepaths]
-            dataset = CSVIterableDataset(filepaths, on="subject_id", shuffle=False)
+            dataset = CSVIterableDataset(src=iters, on="subject_id", shuffle=False)
             count = 0
             for item in dataset:
                 count += 1
@@ -233,6 +241,9 @@ def prepare_csv_file(data, filepath):
                 self.assertEqual(item["meta_0"], False)
             self.assertEqual(count, 5)
+            # manually close the pre-loaded iterables instead of `dataset.close()`
+            for i in iters:
+                i.close()


 if __name__ == "__main__":
     unittest.main()
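Patch 9's `close()` matters because `pandas.read_csv(..., chunksize=...)` returns a `TextFileReader` that holds an open file handle. A sketch of the intended lifecycle, mirroring the updated tests ("records.csv" is a placeholder path):

    import pandas as pd

    from monai.data import CSVIterableDataset

    # file-path input: the dataset creates the TextFileReader itself, so it should close it
    dataset = CSVIterableDataset(src="records.csv", shuffle=False)
    for item in dataset:
        ...
    dataset.close()

    # iterable input: the caller owns the reader and decides when to close it
    reader = pd.read_csv("records.csv", chunksize=1000)
    dataset = CSVIterableDataset(src=reader, shuffle=False)
    for item in dataset:
        ...
    reader.close()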
From 1118bc0445ec8a59525cc43f38e161feab783d95 Mon Sep 17 00:00:00 2001
From: Nic Ma
Date: Mon, 6 Dec 2021 21:15:44 +0800
Subject: [PATCH 10/11] [DLMED] fix doc

Signed-off-by: Nic Ma
---
 monai/data/iterable_dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py
index 6c7c4c7474..e463c8c6ed 100644
--- a/monai/data/iterable_dataset.py
+++ b/monai/data/iterable_dataset.py
@@ -217,9 +217,9 @@ def reset(self, src: Optional[Union[Union[str, Sequence[str]], Union[Iterable, Sequence[Iterable]]]] = None):
         https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html?#iteration.

         Args:
             src: if not None and provided the filename of CSV file, it can be a str, URL, path object
-              or file-like object to load. also support to provide iter for stream input directly,
-              will skip loading from filename. if provided a list of filenames or iters, it will join the tables.
-              default to `self.src`.
+                or file-like object to load. also support to provide iter for stream input directly,
+                will skip loading from filename. if provided a list of filenames or iters, it will join the tables.
+                default to `self.src`.

         """
@@ -239,7 +239,7 @@ def close(self):
         Close the pandas `TextFileReader` iterable objects.
         If the input src is file path, TextFileReader was created internally, need to close it.
         If the input src is iterable object, depends on users requirements whether to close it in this function.
-        For more details, please check: 
+        For more details, please check:
         https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html?#iteration.

         """

From 2425c0b61281b67f65d6acb254ff6894bf2cc6b1 Mon Sep 17 00:00:00 2001
From: monai-bot
Date: Mon, 6 Dec 2021 13:21:01 +0000
Subject: [PATCH 11/11] [MONAI] python code formatting

Signed-off-by: monai-bot
---
 monai/networks/blocks/activation.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/monai/networks/blocks/activation.py b/monai/networks/blocks/activation.py
index b136eb7f1f..9b58be04e8 100644
--- a/monai/networks/blocks/activation.py
+++ b/monai/networks/blocks/activation.py
@@ -19,7 +19,6 @@
     def monai_mish(x, inplace: bool = False):
         return torch.nn.functional.mish(x, inplace=inplace)

-
 else:

     def monai_mish(x, inplace: bool = False):
@@ -31,7 +30,6 @@ def monai_mish(x, inplace: bool = False):
     def monai_swish(x, inplace: bool = False):
         return torch.nn.functional.silu(x, inplace=inplace)

-
 else:

     def monai_swish(x, inplace: bool = False):
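Taken together, the series leaves both CSV datasets with a single `src` entry point, a deprecation shim for `filename`, and an explicit `close()` for internally created readers. A sketch of end-to-end use with a `DataLoader`, as exercised by the tests above (file names are placeholders; per the class docstring, each worker process transforms part of every loaded chunk):

    from monai.data import CSVIterableDataset, DataLoader

    # stream three joined CSV tables
    dataset = CSVIterableDataset(
        src=["data1.csv", "data2.csv", "data3.csv"], on="subject_id", shuffle=False
    )
    for batch in DataLoader(dataset, batch_size=16, num_workers=2):
        ...  # train/evaluate on each batch of joined rows

    dataset.close()  # releases the readers the dataset opened itself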