TileDB-Inc · gsakkis · Aug 22, 2022 · Aug 19, 2022 · Aug 19, 2022
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -5,14 +5,13 @@ on: [push]
 jobs:
   build:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 60
     strategy:
       fail-fast: false
       matrix:
         ml-deps:
-          - "torch==1.10.2+cpu tensorflow-cpu==2.6.3"
-          - "torch==1.11.0+cpu tensorflow-cpu==2.7.1"
-          - "torch==1.12.0+cpu tensorflow-cpu==2.8.1"
+          - "torch==1.12.0+cpu torchvision==0.13.0+cpu torchdata==0.4.0 tensorflow-cpu==2.7.1"
+          - "torch==1.12.1+cpu torchvision==0.13.1+cpu torchdata==0.4.1 tensorflow-cpu==2.8.1"
 
     env:
       run_coverage: ${{ github.ref == 'refs/heads/master' }}
@@ -35,7 +34,7 @@ jobs:
       run: |
         pip install --upgrade pip
         pip install -f https://download.pytorch.org/whl/torch_stable.html protobuf==3.* ${{ matrix.ml-deps }}
-        pip install pytest-mock pytest-cov torchdata scikit-learn==1.0.2
+        pip install pytest-mock pytest-cov scikit-learn==1.0.2
         pip install -e .[cloud]
 
     - name: Run mypy
@@ -47,7 +46,7 @@ jobs:
 
     - name: Run notebook examples
       run: |
-        pip install pytest-xdist nbmake matplotlib torchvision idx2numpy
+        pip install pytest-xdist nbmake matplotlib idx2numpy
         pytest --disable-warnings --nbmake examples/{models,readers}
         # Run tiledb-cloud in parallel
         if [[ "${{ secrets.TILEDB_API_TOKEN }}" != "" ]]; then

diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 import setuptools
 
 tensorflow = ["tensorflow>=2.6"]
-pytorch = ["torch>=1.10", "torchdata"]
+pytorch = ["torch>=1.12", "torchdata"]
 sklearn = ["scikit-learn>=1.0"]
 cloud = ["tiledb-cloud"]
 full = sorted({"torchvision", *tensorflow, *pytorch, *sklearn, *cloud})

diff --git a/tests/readers/test_pytorch.py b/tests/readers/test_pytorch.py
@@ -36,6 +36,11 @@ def test_dataloader(
                     validate_tensor_generator(
                         dataloader, x_spec, y_spec, batch_size, supports_csr=True
                     )
+                    # ensure the dataloader can be iterated again
+                    n1 = sum(1 for _ in dataloader)
+                    assert n1 != 0
+                    n2 = sum(1 for _ in dataloader)
+                    assert n1 == n2
 
     @parametrize_for_dataset(
         non_key_dim_dtype=non_key_dim_dtype,

diff --git a/tiledb/ml/readers/pytorch.py b/tiledb/ml/readers/pytorch.py
@@ -83,7 +83,7 @@ def PyTorchTileDBDataLoader(
             datapipe, num_workers=num_workers, batch_size=None, collate_fn=_identity
         )
         # create a new datapipe for these rows
-        datapipe = IterableWrapper(iter(row_loader), deepcopy=False)
+        datapipe = DeferredIterableIterDataPipe(iter, row_loader)
         # shuffle the datapipe items
         datapipe = datapipe.shuffle(buffer_size=shuffle_buffer_size)
         # run the shuffling on this process, not on workers
@@ -97,6 +97,16 @@ def PyTorchTileDBDataLoader(
     return DataLoader(datapipe, collate_fn=collate_fn, **kwargs)
 
 
+class DeferredIterableIterDataPipe(IterDataPipe):
+    """Wraps a callable returning an iterable; it is called anew on each __iter__."""
+
+    def __init__(self, func: Callable[..., Iterator[Any]], *args: Any, **kwargs: Any):
+        self._callable = partial(func, *args, **kwargs)  # bound now, called in __iter__
+
+    def __iter__(self) -> Iterator[Any]:
+        return iter(self._callable())  # also accepts non-iterator iterables
+
+
 def _identity(x: Any) -> Any:
     return x
 
@@ -113,7 +123,7 @@ def _get_unbatched_datapipe(
     of `Tensor`s, depending on `schema.num_fields`), one for each schema.
     """
     schema_dps = [
-        IterableWrapper(_unbatch_tensors(schema, key_range), deepcopy=False)
+        DeferredIterableIterDataPipe(_unbatch_tensors, schema, key_range)
         for schema in schemas
     ]
     dp = schema_dps.pop(0)