From 0b2f499e245d81b45babe9304cc644ecb7fbf306 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Mon, 4 Oct 2021 22:31:06 +0100 Subject: [PATCH 01/11] relax atol for a100 Signed-off-by: Wenqi Li --- tests/test_affine_transform.py | 24 ++++++++++---------- tests/test_global_mutual_information_loss.py | 2 +- tests/test_rand_affine.py | 2 +- tests/test_rand_affine_grid.py | 2 +- tests/test_rand_affined.py | 2 +- tests/test_rand_elastic_2d.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 42af58be73..689eb6169b 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -95,7 +95,7 @@ def test_to_norm_affine(self, affine, src_size, dst_size, align_corners, expecte affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) new_affine = to_norm_affine(affine, src_size, dst_size, align_corners) new_affine = new_affine.detach().cpu().numpy() - np.testing.assert_allclose(new_affine, expected, atol=1e-4) + np.testing.assert_allclose(new_affine, expected, atol=1e-3, rtol=1e-3) @parameterized.expand(TEST_ILL_TO_NORM_AFFINE_CASES) def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): @@ -113,7 +113,7 @@ def test_affine_shift(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 4, 1, 3], [0, 7, 6, 8], [0, 3, 5, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_affine_shift_1(self): affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0]]) @@ -121,7 +121,7 @@ def test_affine_shift_1(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [0, 4, 1, 3], [0, 7, 6, 8]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_affine_shift_2(self): affine = 
torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) @@ -129,28 +129,28 @@ def test_affine_shift_2(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [4, 1, 3, 2], [7, 6, 8, 5]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_zoom(self): affine = torch.as_tensor([[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]]) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((3, 2))(image, affine) expected = [[[[1, 3], [5, 7], [9, 11]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_zoom_1(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform()(image, affine, (1, 4)) expected = [[[[1, 2, 3, 4]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_zoom_2(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0]], dtype=torch.float32) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((1, 2))(image, affine) expected = [[[[1, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_affine_transform_minimum(self): t = np.pi / 3 @@ -169,7 +169,7 @@ def test_affine_transform_minimum(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_affine_transform_2d(self): t = np.pi / 3 @@ -188,7 +188,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), 
dtype=torch.float32) @@ -205,7 +205,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-4) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_affine_transform_3d(self): t = np.pi / 3 @@ -231,7 +231,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-4) + np.testing.assert_allclose(out, expected, atol=1e-3) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) @@ -255,7 +255,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-4) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_global_mutual_information_loss.py b/tests/test_global_mutual_information_loss.py index a688ea8394..6a658563bc 100644 --- a/tests/test_global_mutual_information_loss.py +++ b/tests/test_global_mutual_information_loss.py @@ -114,7 +114,7 @@ class TestGlobalMutualInformationLoss(unittest.TestCase): @SkipIfBeforePyTorchVersion((1, 9)) def test_shape(self, input_param, input_data, expected_val): result = GlobalMutualInformationLoss(**input_param).forward(**input_data) - np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, rtol=1e-4) + np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, rtol=1e-3, atol=1e-3) def test_ill_shape(self): loss = GlobalMutualInformationLoss() diff --git a/tests/test_rand_affine.py b/tests/test_rand_affine.py index c88aa538ed..f551bb4b43 100644 --- a/tests/test_rand_affine.py +++ b/tests/test_rand_affine.py @@ -141,7 +141,7 @@ def test_rand_affine(self, input_param, input_data, expected_val): result = g(**input_data) if input_param.get("cache_grid", False): self.assertTrue(g._cached_grid is not None) - assert_allclose(result, expected_val, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, 
rtol=1e-3, atol=1e-3) def test_ill_cache(self): with self.assertWarns(UserWarning): diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 4fb534aba1..7f8c5826bc 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -201,7 +201,7 @@ def test_rand_affine_grid(self, input_param, input_data, expected_val): result = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, type_test=False, rtol=1e-3, atol=1e-3) if __name__ == "__main__": diff --git a/tests/test_rand_affined.py b/tests/test_rand_affined.py index 0109175b16..179cccbd4e 100644 --- a/tests/test_rand_affined.py +++ b/tests/test_rand_affined.py @@ -209,7 +209,7 @@ def test_rand_affined(self, input_param, input_data, expected_val): if "_transforms" in key: continue expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected, rtol=1e-3, atol=1e-3) g.set_random_state(4) res = g(input_data) diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index c414eb1ffd..cb84d49b43 100644 --- a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ -110,7 +110,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - assert_allclose(result, expected_val, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, rtol=1e-3, atol=1e-3) if __name__ == "__main__": From b1b87f3edeb3afb0e5522e92c266900ccee07e46 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 09:30:36 +0100 Subject: [PATCH 02/11] temp tests Signed-off-by: Wenqi Li --- .github/workflows/pythonapp-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index edaa2487ce..3326db38e7 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -6,6 +6,7 @@ on: branches: - main - releasing/* + - test-3071 pull_request: concurrency: @@ -103,7 +104,7 @@ jobs: # fixes preinstalled ruamel_yaml error from the docker image rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel* python -m pip install ${{ matrix.pytorch }} - python -m pip install -r requirements-dev.txt + python -m pip install -r requirements-dev.txt --user python -m pip list - name: Run quick tests (GPU) run: | From b3a727905af3472857b78f67d8e16feef05b90b6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 09:43:30 +0100 Subject: [PATCH 03/11] fixes #3071 Signed-off-by: Wenqi Li --- .github/workflows/pythonapp-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index 3326db38e7..c13d932598 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -104,7 +104,7 @@ jobs: # fixes preinstalled ruamel_yaml error from the docker image rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel* python -m pip install ${{ matrix.pytorch }} - python -m pip install -r requirements-dev.txt --user + python -m pip install -r requirements-dev.txt --ignore-installed ruamel_yaml python -m pip list - name: Run quick tests (GPU) run: | From 9a3b70e74ea15f9d5b6627c95de086ba6540296d Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 09:54:53 +0100 Subject: [PATCH 04/11] remove temp tests Signed-off-by: Wenqi Li --- .github/workflows/pythonapp-gpu.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index c13d932598..edaa2487ce 100644 --- 
a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -6,7 +6,6 @@ on: branches: - main - releasing/* - - test-3071 pull_request: concurrency: @@ -104,7 +103,7 @@ jobs: # fixes preinstalled ruamel_yaml error from the docker image rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel* python -m pip install ${{ matrix.pytorch }} - python -m pip install -r requirements-dev.txt --ignore-installed ruamel_yaml + python -m pip install -r requirements-dev.txt python -m pip list - name: Run quick tests (GPU) run: | From a0a0905053e7eb576e02cb37922ce78182ad7034 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 18:43:55 +0100 Subject: [PATCH 05/11] fixes tests Signed-off-by: Wenqi Li --- tests/test_affine_grid.py | 2 +- tests/test_affine_transform.py | 2 +- tests/test_create_grid_and_affine.py | 2 +- tests/test_lltm.py | 4 ++-- tests/test_rand_affine_grid.py | 2 +- tests/test_rand_elastic_2d.py | 2 +- tests/test_rand_elasticd_2d.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_affine_grid.py b/tests/test_affine_grid.py index 972cf20a1f..97ab6d4ebb 100644 --- a/tests/test_affine_grid.py +++ b/tests/test_affine_grid.py @@ -115,7 +115,7 @@ def test_affine_grid(self, input_param, input_data, expected_val): result, _ = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, type_test=False, rtol=5e-2, atol=5e-2) if __name__ == "__main__": diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 689eb6169b..99df771670 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -205,7 +205,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=5e-3) def 
test_affine_transform_3d(self): t = np.pi / 3 diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index b53eaa5b9d..b59f53bcc8 100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -162,7 +162,7 @@ def test_assert(func, params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, atol=1e-7) + assert_allclose(m, expected, type_test=False, atol=1e-3, rtol=1e-3) class TestCreateAffine(unittest.TestCase): diff --git a/tests/test_lltm.py b/tests/test_lltm.py index f1311379bc..b53f0c80f3 100644 --- a/tests/test_lltm.py +++ b/tests/test_lltm.py @@ -50,8 +50,8 @@ def test_value_cuda(self, input_param, expected_h, expected_c): new_h, new_c = lltm(x, (h, c)) (new_h.sum() + new_c.sum()).backward() - torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=0.0001, atol=1e-04) - torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=0.0001, atol=1e-04) + torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=0.001, atol=0.001) + torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=0.001, atol=0.001) if __name__ == "__main__": diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 7f8c5826bc..18e13f5069 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -201,7 +201,7 @@ def test_rand_affine_grid(self, input_param, input_data, expected_val): result = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected_val, type_test=False, rtol=1e-2, atol=1e-2) if __name__ == "__main__": diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index cb84d49b43..210be1ab25 100644 --- a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ 
-110,7 +110,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - assert_allclose(result, expected_val, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected_val, rtol=1e-2, atol=1e-2) if __name__ == "__main__": diff --git a/tests/test_rand_elasticd_2d.py b/tests/test_rand_elasticd_2d.py index 84f18120e1..62ee66608b 100644 --- a/tests/test_rand_elasticd_2d.py +++ b/tests/test_rand_elasticd_2d.py @@ -164,7 +164,7 @@ def test_rand_2d_elasticd(self, input_param, input_data, expected_val): for key in res: result = res[key] expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected, rtol=5e-3, atol=5e-3) if __name__ == "__main__": From d74565d003ecc8f4b7465d7e8f64e9bd334ce253 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 20:46:44 +0100 Subject: [PATCH 06/11] update tests Signed-off-by: Wenqi Li --- tests/test_affine_transform.py | 2 +- tests/test_rand_elastic_2d.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 99df771670..edc680eaf9 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -255,7 +255,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=5e-3) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index 210be1ab25..769d87c940 100644 --- a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ -110,7 +110,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - 
assert_allclose(result, expected_val, rtol=1e-2, atol=1e-2) + assert_allclose(result, expected_val, rtol=5e-3, atol=5e-3) if __name__ == "__main__": From 1c01c2adf2248d9144030394f0d26d3edc1965d1 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 09:47:17 +0100 Subject: [PATCH 07/11] add tf32 tests Signed-off-by: Wenqi Li --- tests/test_affine_grid.py | 6 ++++-- tests/test_affine_transform.py | 25 ++++++++++++++----------- tests/test_create_grid_and_affine.py | 4 ++-- tests/utils.py | 9 +++++++++ 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/test_affine_grid.py b/tests/test_affine_grid.py index 97ab6d4ebb..9bf2bcf90e 100644 --- a/tests/test_affine_grid.py +++ b/tests/test_affine_grid.py @@ -16,7 +16,7 @@ from parameterized import parameterized from monai.transforms import AffineGrid -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env TESTS = [] for p in TEST_NDARRAYS: @@ -107,6 +107,8 @@ ] ) +_rtol = 5e-2 if is_tf32_env() else 1e-4 + class TestAffineGrid(unittest.TestCase): @parameterized.expand(TESTS) @@ -115,7 +117,7 @@ def test_affine_grid(self, input_param, input_data, expected_val): result, _ = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=5e-2, atol=5e-2) + assert_allclose(result, expected_val, type_test=False, rtol=_rtol) if __name__ == "__main__": diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index edc680eaf9..5c2637cec8 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -17,6 +17,9 @@ from monai.networks import normalize_transform, to_norm_affine from monai.networks.layers import AffineTransform +from tests.utils import is_tf32_env + +_rtol = 1e-4 if not is_tf32_env() else 5e-3 TEST_NORM_CASES = [ [(4, 5), True, [[[0.666667, 0, -1], [0, 0.5, -1], [0, 0, 1]]]], @@ -95,7 +98,7 @@ 
def test_to_norm_affine(self, affine, src_size, dst_size, align_corners, expecte affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) new_affine = to_norm_affine(affine, src_size, dst_size, align_corners) new_affine = new_affine.detach().cpu().numpy() - np.testing.assert_allclose(new_affine, expected, atol=1e-3, rtol=1e-3) + np.testing.assert_allclose(new_affine, expected, atol=1e-5, rtol=1e-3) @parameterized.expand(TEST_ILL_TO_NORM_AFFINE_CASES) def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): @@ -113,7 +116,7 @@ def test_affine_shift(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 4, 1, 3], [0, 7, 6, 8], [0, 3, 5, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_affine_shift_1(self): affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0]]) @@ -121,7 +124,7 @@ def test_affine_shift_1(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [0, 4, 1, 3], [0, 7, 6, 8]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_affine_shift_2(self): affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) @@ -129,14 +132,14 @@ def test_affine_shift_2(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [4, 1, 3, 2], [7, 6, 8, 5]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_zoom(self): affine = torch.as_tensor([[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]]) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((3, 2))(image, affine) expected = [[[[1, 3], [5, 7], [9, 11]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, 
rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_zoom_1(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) @@ -150,7 +153,7 @@ def test_zoom_2(self): image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((1, 2))(image, affine) expected = [[[[1, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_affine_transform_minimum(self): t = np.pi / 3 @@ -169,7 +172,7 @@ def test_affine_transform_minimum(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-3, rtol=_rtol) def test_affine_transform_2d(self): t = np.pi / 3 @@ -188,7 +191,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-3, rtol=_rtol) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) @@ -205,7 +208,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=5e-3) + np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) def test_affine_transform_3d(self): t = np.pi / 3 @@ -231,7 +234,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) @@ -255,7 +258,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=5e-3) + np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index b59f53bcc8..eb0236452b 
100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -22,7 +22,7 @@ create_shear, create_translate, ) -from tests.utils import assert_allclose +from tests.utils import assert_allclose, is_tf32_env class TestCreateGrid(unittest.TestCase): @@ -162,7 +162,7 @@ def test_assert(func, params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, atol=1e-3, rtol=1e-3) + assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5) class TestCreateAffine(unittest.TestCase): diff --git a/tests/utils.py b/tests/utils.py index b7e32068c3..e9633aaa0a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -104,6 +104,15 @@ def test_is_quick(): return os.environ.get(quick_test_var, "").lower() == "true" +def is_tf32_env(): + """ + The environment variable NVIDIA_TF32_OVERRIDE=0 will override any defaults + or programmatic configuration of NVIDIA libraries, and consequently, + cuBLAS will not accelerate FP32 computations with TF32 tensor cores. + """ + return os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" + + def skip_if_quick(obj): """ Skip the unit tests if environment variable `quick_test_var=true`. 
From 22c4b535f187beb9d5afe50fcb81c632e459519d Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 09:59:30 +0100 Subject: [PATCH 08/11] fixes rtol wit tf32 Signed-off-by: Wenqi Li --- tests/test_affine_transform.py | 4 ++-- tests/test_create_grid_and_affine.py | 2 +- tests/test_lltm.py | 8 +++++--- tests/test_rand_affine.py | 6 ++++-- tests/test_rand_affine_grid.py | 6 ++++-- tests/test_rand_affined.py | 6 ++++-- tests/test_rand_elastic_2d.py | 6 ++++-- tests/test_rand_elasticd_2d.py | 6 ++++-- 8 files changed, 28 insertions(+), 16 deletions(-) diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 5c2637cec8..8e27dca076 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -98,7 +98,7 @@ def test_to_norm_affine(self, affine, src_size, dst_size, align_corners, expecte affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) new_affine = to_norm_affine(affine, src_size, dst_size, align_corners) new_affine = new_affine.detach().cpu().numpy() - np.testing.assert_allclose(new_affine, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(new_affine, expected, atol=1e-5, rtol=_rtol) @parameterized.expand(TEST_ILL_TO_NORM_AFFINE_CASES) def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): @@ -146,7 +146,7 @@ def test_zoom_1(self): image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform()(image, affine, (1, 4)) expected = [[[[1, 2, 3, 4]]]] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=_rtol) def test_zoom_2(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0]], dtype=torch.float32) diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index eb0236452b..917f01ff96 100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -162,7 +162,7 @@ def test_assert(func, 
params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5) + assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5, atol=1e-5) class TestCreateAffine(unittest.TestCase): diff --git a/tests/test_lltm.py b/tests/test_lltm.py index b53f0c80f3..4186c91246 100644 --- a/tests/test_lltm.py +++ b/tests/test_lltm.py @@ -15,7 +15,9 @@ from parameterized import parameterized from monai.networks.layers import LLTM -from tests.utils import SkipIfNoModule +from tests.utils import SkipIfNoModule, is_tf32_env + +_rtol = 0.001 if is_tf32_env() else 0.0001 TEST_CASE_1 = [ {"input_features": 32, "state_size": 2}, @@ -50,8 +52,8 @@ def test_value_cuda(self, input_param, expected_h, expected_c): new_h, new_c = lltm(x, (h, c)) (new_h.sum() + new_c.sum()).backward() - torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=0.001, atol=0.001) - torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=0.001, atol=0.001) + torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=_rtol, atol=0.001) + torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=_rtol, atol=0.001) if __name__ == "__main__": diff --git a/tests/test_rand_affine.py b/tests/test_rand_affine.py index f551bb4b43..96322813c9 100644 --- a/tests/test_rand_affine.py +++ b/tests/test_rand_affine.py @@ -17,7 +17,9 @@ from monai.transforms import RandAffine from monai.utils.type_conversion import convert_data_type -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 1e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -141,7 +143,7 @@ def test_rand_affine(self, input_param, input_data, expected_val): result = g(**input_data) if input_param.get("cache_grid", False): self.assertTrue(g._cached_grid is not None) - assert_allclose(result, 
expected_val, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected_val, rtol=_rtol, atol=1e-4) def test_ill_cache(self): with self.assertWarns(UserWarning): diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 18e13f5069..07e94a5ded 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -16,7 +16,9 @@ from parameterized import parameterized from monai.transforms import RandAffineGrid -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 1e-2 if is_tf32_env else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -201,7 +203,7 @@ def test_rand_affine_grid(self, input_param, input_data, expected_val): result = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-2, atol=1e-2) + assert_allclose(result, expected_val, type_test=False, rtol=_rtol, atol=1e-4) if __name__ == "__main__": diff --git a/tests/test_rand_affined.py b/tests/test_rand_affined.py index 179cccbd4e..651452ab07 100644 --- a/tests/test_rand_affined.py +++ b/tests/test_rand_affined.py @@ -17,7 +17,9 @@ from monai.transforms import RandAffined from monai.utils import GridSampleMode -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 1e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -209,7 +211,7 @@ def test_rand_affined(self, input_param, input_data, expected_val): if "_transforms" in key: continue expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected, rtol=_rtol, atol=1e-3) g.set_random_state(4) res = g(input_data) diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index 769d87c940..22920d0f35 100644 --- 
a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ -16,7 +16,9 @@ from parameterized import parameterized from monai.transforms import Rand2DElastic -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 5e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -110,7 +112,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - assert_allclose(result, expected_val, rtol=5e-3, atol=5e-3) + assert_allclose(result, expected_val, rtol=_rtol, atol=1e-4) if __name__ == "__main__": diff --git a/tests/test_rand_elasticd_2d.py b/tests/test_rand_elasticd_2d.py index 62ee66608b..77e6489d50 100644 --- a/tests/test_rand_elasticd_2d.py +++ b/tests/test_rand_elasticd_2d.py @@ -16,7 +16,9 @@ from parameterized import parameterized from monai.transforms import Rand2DElasticd -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 5e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -164,7 +166,7 @@ def test_rand_2d_elasticd(self, input_param, input_data, expected_val): for key in res: result = res[key] expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=5e-3, atol=5e-3) + assert_allclose(result, expected, rtol=_rtol, atol=5e-3) if __name__ == "__main__": From 29dd41ce7011a1e4b8842604f4c0ee1b8fa809c4 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 11:11:02 +0100 Subject: [PATCH 09/11] detect tf32 Signed-off-by: Wenqi Li --- monai/__init__.py | 2 +- tests/__init__.py | 16 ++++++++++++++++ tests/utils.py | 10 +++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/monai/__init__.py b/monai/__init__.py index 7ab30bcae7..5043208b9c 100644 --- a/monai/__init__.py +++ 
b/monai/__init__.py @@ -26,7 +26,7 @@ __basedir__ = os.path.dirname(__file__) -if not (sys.version_info.major == PY_REQUIRED_MAJOR and sys.version_info.minor >= PY_REQUIRED_MINOR): +if sys.version_info.major != PY_REQUIRED_MAJOR or sys.version_info.minor < PY_REQUIRED_MINOR: raise RuntimeError( "MONAI requires Python {}.{} or higher. But the current Python is: {}".format( PY_REQUIRED_MAJOR, PY_REQUIRED_MINOR, sys.version diff --git a/tests/__init__.py b/tests/__init__.py index 5093d1f72d..215e677d45 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -9,10 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import sys import unittest import warnings +import torch + def _enter_pr_4800(self): """ @@ -35,3 +38,16 @@ def _enter_pr_4800(self): unittest.case._AssertWarnsContext.__enter__ = _enter_pr_4800 # type: ignore except AttributeError: pass + + +_tf32_enabled: bool = False +if torch.cuda.is_available() and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0": + try: + # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result + g_gpu = torch.Generator(device="cuda") + g_gpu.manual_seed(2147483647) + a_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) + b_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) + _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.01 # 0.1713 + except BaseException: + pass diff --git a/tests/utils.py b/tests/utils.py index e9633aaa0a..664f47419a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -25,7 +25,7 @@ from io import BytesIO from subprocess import PIPE, Popen from typing import Callable, Optional, Tuple -from urllib.error import ContentTooShortError, HTTPError, URLError +from urllib.error import HTTPError, URLError import numpy as np import torch @@ -39,6 +39,7 @@ from monai.utils.misc import 
is_module_ver_at_least from monai.utils.module import version_leq from monai.utils.type_conversion import convert_data_type +from tests import _tf32_enabled nib, _ = optional_import("nibabel") @@ -94,10 +95,9 @@ def assert_allclose( def test_pretrained_networks(network, input_param, device): try: - net = network(**input_param).to(device) - except (URLError, HTTPError, ContentTooShortError) as e: + return network(**input_param).to(device) + except (URLError, HTTPError) as e: raise unittest.SkipTest(e) from e - return net def test_is_quick(): @@ -110,7 +110,7 @@ def is_tf32_env(): or programmatic configuration of NVIDIA libraries, and consequently, cuBLAS will not accelerate FP32 computations with TF32 tensor cores. """ - return os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" + return _tf32_enabled def skip_if_quick(obj): From 15a673dc876be25892fb670c0fe90ed309ebaaff Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 06:24:19 -0400 Subject: [PATCH 10/11] update flag Signed-off-by: Wenqi Li --- tests/__init__.py | 7 ++++++- tests/test_affine_transform.py | 4 ++-- tests/test_create_grid_and_affine.py | 2 +- tests/test_rand_affine_grid.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 215e677d45..9577ff6370 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -41,7 +41,11 @@ def _enter_pr_4800(self): _tf32_enabled: bool = False -if torch.cuda.is_available() and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0": +if ( + torch.cuda.is_available() + and f"{torch.version.cuda}".startswith("11") + and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" +): try: # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result g_gpu = torch.Generator(device="cuda") @@ -51,3 +55,4 @@ def _enter_pr_4800(self): _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.01 # 0.1713 except BaseException: pass +print(f"tf32 enabled: 
{_tf32_enabled}") diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 8e27dca076..ef39c297ce 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -208,7 +208,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) + np.testing.assert_allclose(out, expected, atol=5e-3) def test_affine_transform_3d(self): t = np.pi / 3 @@ -258,7 +258,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) + np.testing.assert_allclose(out, expected, atol=5e-3) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index 917f01ff96..cd8d75f63e 100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -162,7 +162,7 @@ def test_assert(func, params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5, atol=1e-5) + assert_allclose(m, expected, type_test=False, rtol=1e-2 if is_tf32_env() else 1e-5, atol=1e-5) class TestCreateAffine(unittest.TestCase): diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 07e94a5ded..ade615cd65 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -18,7 +18,7 @@ from monai.transforms import RandAffineGrid from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env -_rtol = 1e-2 if is_tf32_env else 1e-4 +_rtol = 1e-1 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: From 2ab64d3d3e87ca45ace2fde2c9617a4b4f9efce1 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 07:18:10 -0400 Subject: [PATCH 11/11] fixes testing util Signed-off-by: Wenqi Li --- tests/__init__.py | 21 --------------------- tests/utils.py | 21 
++++++++++++++++++++- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 9577ff6370..5093d1f72d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -9,13 +9,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import sys import unittest import warnings -import torch - def _enter_pr_4800(self): """ @@ -38,21 +35,3 @@ def _enter_pr_4800(self): unittest.case._AssertWarnsContext.__enter__ = _enter_pr_4800 # type: ignore except AttributeError: pass - - -_tf32_enabled: bool = False -if ( - torch.cuda.is_available() - and f"{torch.version.cuda}".startswith("11") - and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" -): - try: - # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result - g_gpu = torch.Generator(device="cuda") - g_gpu.manual_seed(2147483647) - a_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) - b_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) - _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.01 # 0.1713 - except BaseException: - pass -print(f"tf32 enabled: {_tf32_enabled}") diff --git a/tests/utils.py b/tests/utils.py index 664f47419a..833d5d2cc0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -39,11 +39,11 @@ from monai.utils.misc import is_module_ver_at_least from monai.utils.module import version_leq from monai.utils.type_conversion import convert_data_type -from tests import _tf32_enabled nib, _ = optional_import("nibabel") quick_test_var = "QUICKTEST" +_tf32_enabled = None def clone(data: NdarrayTensor) -> NdarrayTensor: @@ -110,6 +110,25 @@ def is_tf32_env(): or programmatic configuration of NVIDIA libraries, and consequently, cuBLAS will not accelerate FP32 computations with TF32 tensor cores. 
""" + global _tf32_enabled + if _tf32_enabled is None: + _tf32_enabled = False + if ( + torch.cuda.is_available() + and f"{torch.version.cuda}".startswith("11") + and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" + and torch.cuda.device_count() > 0 + ): + try: + # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result + g_gpu = torch.Generator(device="cuda") + g_gpu.manual_seed(2147483647) + a_full = torch.randn(1024, 1024, dtype=torch.double, device="cuda", generator=g_gpu) + b_full = torch.randn(1024, 1024, dtype=torch.double, device="cuda", generator=g_gpu) + _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.001 # observed ~0.17 at 10240x10240; threshold scaled down for 1024x1024 -- TODO confirm margin + except BaseException: + pass + print(f"tf32 enabled: {_tf32_enabled}") return _tf32_enabled