From 0b2f499e245d81b45babe9304cc644ecb7fbf306 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Mon, 4 Oct 2021 22:31:06 +0100 Subject: [PATCH 01/11] relax atol for a100 Signed-off-by: Wenqi Li --- tests/test_affine_transform.py | 24 ++++++++++---------- tests/test_global_mutual_information_loss.py | 2 +- tests/test_rand_affine.py | 2 +- tests/test_rand_affine_grid.py | 2 +- tests/test_rand_affined.py | 2 +- tests/test_rand_elastic_2d.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 42af58be73..689eb6169b 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -95,7 +95,7 @@ def test_to_norm_affine(self, affine, src_size, dst_size, align_corners, expecte affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) new_affine = to_norm_affine(affine, src_size, dst_size, align_corners) new_affine = new_affine.detach().cpu().numpy() - np.testing.assert_allclose(new_affine, expected, atol=1e-4) + np.testing.assert_allclose(new_affine, expected, atol=1e-3, rtol=1e-3) @parameterized.expand(TEST_ILL_TO_NORM_AFFINE_CASES) def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): @@ -113,7 +113,7 @@ def test_affine_shift(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 4, 1, 3], [0, 7, 6, 8], [0, 3, 5, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_affine_shift_1(self): affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0]]) @@ -121,7 +121,7 @@ def test_affine_shift_1(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [0, 4, 1, 3], [0, 7, 6, 8]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_affine_shift_2(self): affine = 
torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) @@ -129,28 +129,28 @@ def test_affine_shift_2(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [4, 1, 3, 2], [7, 6, 8, 5]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_zoom(self): affine = torch.as_tensor([[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]]) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((3, 2))(image, affine) expected = [[[[1, 3], [5, 7], [9, 11]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) def test_zoom_1(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform()(image, affine, (1, 4)) expected = [[[[1, 2, 3, 4]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_zoom_2(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0]], dtype=torch.float32) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((1, 2))(image, affine) expected = [[[[1, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_affine_transform_minimum(self): t = np.pi / 3 @@ -169,7 +169,7 @@ def test_affine_transform_minimum(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_affine_transform_2d(self): t = np.pi / 3 @@ -188,7 +188,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-5) + np.testing.assert_allclose(out, expected, atol=1e-3) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), 
dtype=torch.float32) @@ -205,7 +205,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-4) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_affine_transform_3d(self): t = np.pi / 3 @@ -231,7 +231,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-4) + np.testing.assert_allclose(out, expected, atol=1e-3) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) @@ -255,7 +255,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-4) + np.testing.assert_allclose(out, expected, atol=1e-3) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_global_mutual_information_loss.py b/tests/test_global_mutual_information_loss.py index a688ea8394..6a658563bc 100644 --- a/tests/test_global_mutual_information_loss.py +++ b/tests/test_global_mutual_information_loss.py @@ -114,7 +114,7 @@ class TestGlobalMutualInformationLoss(unittest.TestCase): @SkipIfBeforePyTorchVersion((1, 9)) def test_shape(self, input_param, input_data, expected_val): result = GlobalMutualInformationLoss(**input_param).forward(**input_data) - np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, rtol=1e-4) + np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, rtol=1e-3, atol=1e-3) def test_ill_shape(self): loss = GlobalMutualInformationLoss() diff --git a/tests/test_rand_affine.py b/tests/test_rand_affine.py index c88aa538ed..f551bb4b43 100644 --- a/tests/test_rand_affine.py +++ b/tests/test_rand_affine.py @@ -141,7 +141,7 @@ def test_rand_affine(self, input_param, input_data, expected_val): result = g(**input_data) if input_param.get("cache_grid", False): self.assertTrue(g._cached_grid is not None) - assert_allclose(result, expected_val, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, 
rtol=1e-3, atol=1e-3) def test_ill_cache(self): with self.assertWarns(UserWarning): diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 4fb534aba1..7f8c5826bc 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -201,7 +201,7 @@ def test_rand_affine_grid(self, input_param, input_data, expected_val): result = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, type_test=False, rtol=1e-3, atol=1e-3) if __name__ == "__main__": diff --git a/tests/test_rand_affined.py b/tests/test_rand_affined.py index 0109175b16..179cccbd4e 100644 --- a/tests/test_rand_affined.py +++ b/tests/test_rand_affined.py @@ -209,7 +209,7 @@ def test_rand_affined(self, input_param, input_data, expected_val): if "_transforms" in key: continue expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected, rtol=1e-3, atol=1e-3) g.set_random_state(4) res = g(input_data) diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index c414eb1ffd..cb84d49b43 100644 --- a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ -110,7 +110,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - assert_allclose(result, expected_val, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, rtol=1e-3, atol=1e-3) if __name__ == "__main__": From b1b87f3edeb3afb0e5522e92c266900ccee07e46 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 09:30:36 +0100 Subject: [PATCH 02/11] temp tests Signed-off-by: Wenqi Li --- .github/workflows/pythonapp-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index edaa2487ce..3326db38e7 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -6,6 +6,7 @@ on: branches: - main - releasing/* + - test-3071 pull_request: concurrency: @@ -103,7 +104,7 @@ jobs: # fixes preinstalled ruamel_yaml error from the docker image rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel* python -m pip install ${{ matrix.pytorch }} - python -m pip install -r requirements-dev.txt + python -m pip install -r requirements-dev.txt --user python -m pip list - name: Run quick tests (GPU) run: | From b3a727905af3472857b78f67d8e16feef05b90b6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 09:43:30 +0100 Subject: [PATCH 03/11] fixes #3071 Signed-off-by: Wenqi Li --- .github/workflows/pythonapp-gpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index 3326db38e7..c13d932598 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -104,7 +104,7 @@ jobs: # fixes preinstalled ruamel_yaml error from the docker image rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel* python -m pip install ${{ matrix.pytorch }} - python -m pip install -r requirements-dev.txt --user + python -m pip install -r requirements-dev.txt --ignore-installed ruamel_yaml python -m pip list - name: Run quick tests (GPU) run: | From 9a3b70e74ea15f9d5b6627c95de086ba6540296d Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 09:54:53 +0100 Subject: [PATCH 04/11] remove temp tests Signed-off-by: Wenqi Li --- .github/workflows/pythonapp-gpu.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index c13d932598..edaa2487ce 100644 --- 
a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -6,7 +6,6 @@ on: branches: - main - releasing/* - - test-3071 pull_request: concurrency: @@ -104,7 +103,7 @@ jobs: # fixes preinstalled ruamel_yaml error from the docker image rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel* python -m pip install ${{ matrix.pytorch }} - python -m pip install -r requirements-dev.txt --ignore-installed ruamel_yaml + python -m pip install -r requirements-dev.txt python -m pip list - name: Run quick tests (GPU) run: | From a0a0905053e7eb576e02cb37922ce78182ad7034 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 18:43:55 +0100 Subject: [PATCH 05/11] fixes tests Signed-off-by: Wenqi Li --- tests/test_affine_grid.py | 2 +- tests/test_affine_transform.py | 2 +- tests/test_create_grid_and_affine.py | 2 +- tests/test_lltm.py | 4 ++-- tests/test_rand_affine_grid.py | 2 +- tests/test_rand_elastic_2d.py | 2 +- tests/test_rand_elasticd_2d.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_affine_grid.py b/tests/test_affine_grid.py index 972cf20a1f..97ab6d4ebb 100644 --- a/tests/test_affine_grid.py +++ b/tests/test_affine_grid.py @@ -115,7 +115,7 @@ def test_affine_grid(self, input_param, input_data, expected_val): result, _ = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected_val, type_test=False, rtol=5e-2, atol=5e-2) if __name__ == "__main__": diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 689eb6169b..99df771670 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -205,7 +205,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=5e-3) def 
test_affine_transform_3d(self): t = np.pi / 3 diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index b53eaa5b9d..b59f53bcc8 100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -162,7 +162,7 @@ def test_assert(func, params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, atol=1e-7) + assert_allclose(m, expected, type_test=False, atol=1e-3, rtol=1e-3) class TestCreateAffine(unittest.TestCase): diff --git a/tests/test_lltm.py b/tests/test_lltm.py index f1311379bc..b53f0c80f3 100644 --- a/tests/test_lltm.py +++ b/tests/test_lltm.py @@ -50,8 +50,8 @@ def test_value_cuda(self, input_param, expected_h, expected_c): new_h, new_c = lltm(x, (h, c)) (new_h.sum() + new_c.sum()).backward() - torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=0.0001, atol=1e-04) - torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=0.0001, atol=1e-04) + torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=0.001, atol=0.001) + torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=0.001, atol=0.001) if __name__ == "__main__": diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 7f8c5826bc..18e13f5069 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -201,7 +201,7 @@ def test_rand_affine_grid(self, input_param, input_data, expected_val): result = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected_val, type_test=False, rtol=1e-2, atol=1e-2) if __name__ == "__main__": diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index cb84d49b43..210be1ab25 100644 --- a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ 
-110,7 +110,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - assert_allclose(result, expected_val, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected_val, rtol=1e-2, atol=1e-2) if __name__ == "__main__": diff --git a/tests/test_rand_elasticd_2d.py b/tests/test_rand_elasticd_2d.py index 84f18120e1..62ee66608b 100644 --- a/tests/test_rand_elasticd_2d.py +++ b/tests/test_rand_elasticd_2d.py @@ -164,7 +164,7 @@ def test_rand_2d_elasticd(self, input_param, input_data, expected_val): for key in res: result = res[key] expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + assert_allclose(result, expected, rtol=5e-3, atol=5e-3) if __name__ == "__main__": From d74565d003ecc8f4b7465d7e8f64e9bd334ce253 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 5 Oct 2021 20:46:44 +0100 Subject: [PATCH 06/11] update tests Signed-off-by: Wenqi Li --- tests/test_affine_transform.py | 2 +- tests/test_rand_elastic_2d.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 99df771670..edc680eaf9 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -255,7 +255,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=5e-3) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index 210be1ab25..769d87c940 100644 --- a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ -110,7 +110,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - 
assert_allclose(result, expected_val, rtol=1e-2, atol=1e-2) + assert_allclose(result, expected_val, rtol=5e-3, atol=5e-3) if __name__ == "__main__": From 1c01c2adf2248d9144030394f0d26d3edc1965d1 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 09:47:17 +0100 Subject: [PATCH 07/11] add tf32 tests Signed-off-by: Wenqi Li --- tests/test_affine_grid.py | 6 ++++-- tests/test_affine_transform.py | 25 ++++++++++++++----------- tests/test_create_grid_and_affine.py | 4 ++-- tests/utils.py | 9 +++++++++ 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/test_affine_grid.py b/tests/test_affine_grid.py index 97ab6d4ebb..9bf2bcf90e 100644 --- a/tests/test_affine_grid.py +++ b/tests/test_affine_grid.py @@ -16,7 +16,7 @@ from parameterized import parameterized from monai.transforms import AffineGrid -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env TESTS = [] for p in TEST_NDARRAYS: @@ -107,6 +107,8 @@ ] ) +_rtol = 5e-2 if is_tf32_env() else 1e-4 + class TestAffineGrid(unittest.TestCase): @parameterized.expand(TESTS) @@ -115,7 +117,7 @@ def test_affine_grid(self, input_param, input_data, expected_val): result, _ = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=5e-2, atol=5e-2) + assert_allclose(result, expected_val, type_test=False, rtol=_rtol) if __name__ == "__main__": diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index edc680eaf9..5c2637cec8 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -17,6 +17,9 @@ from monai.networks import normalize_transform, to_norm_affine from monai.networks.layers import AffineTransform +from tests.utils import is_tf32_env + +_rtol = 1e-4 if not is_tf32_env() else 5e-3 TEST_NORM_CASES = [ [(4, 5), True, [[[0.666667, 0, -1], [0, 0.5, -1], [0, 0, 1]]]], @@ -95,7 +98,7 @@ 
def test_to_norm_affine(self, affine, src_size, dst_size, align_corners, expecte affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) new_affine = to_norm_affine(affine, src_size, dst_size, align_corners) new_affine = new_affine.detach().cpu().numpy() - np.testing.assert_allclose(new_affine, expected, atol=1e-3, rtol=1e-3) + np.testing.assert_allclose(new_affine, expected, atol=1e-5, rtol=1e-3) @parameterized.expand(TEST_ILL_TO_NORM_AFFINE_CASES) def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): @@ -113,7 +116,7 @@ def test_affine_shift(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 4, 1, 3], [0, 7, 6, 8], [0, 3, 5, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_affine_shift_1(self): affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0]]) @@ -121,7 +124,7 @@ def test_affine_shift_1(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [0, 4, 1, 3], [0, 7, 6, 8]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_affine_shift_2(self): affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) @@ -129,14 +132,14 @@ def test_affine_shift_2(self): out = AffineTransform()(image, affine) out = out.detach().cpu().numpy() expected = [[[[0, 0, 0, 0], [4, 1, 3, 2], [7, 6, 8, 5]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_zoom(self): affine = torch.as_tensor([[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]]) image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((3, 2))(image, affine) expected = [[[[1, 3], [5, 7], [9, 11]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, 
rtol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_zoom_1(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) @@ -150,7 +153,7 @@ def test_zoom_2(self): image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform((1, 2))(image, affine) expected = [[[[1, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_affine_transform_minimum(self): t = np.pi / 3 @@ -169,7 +172,7 @@ def test_affine_transform_minimum(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-3, rtol=_rtol) def test_affine_transform_2d(self): t = np.pi / 3 @@ -188,7 +191,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-3, rtol=_rtol) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) @@ -205,7 +208,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=5e-3) + np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) def test_affine_transform_3d(self): t = np.pi / 3 @@ -231,7 +234,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) if torch.cuda.is_available(): affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) @@ -255,7 +258,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=5e-3) + np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index b59f53bcc8..eb0236452b 
100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -22,7 +22,7 @@ create_shear, create_translate, ) -from tests.utils import assert_allclose +from tests.utils import assert_allclose, is_tf32_env class TestCreateGrid(unittest.TestCase): @@ -162,7 +162,7 @@ def test_assert(func, params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, atol=1e-3, rtol=1e-3) + assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5) class TestCreateAffine(unittest.TestCase): diff --git a/tests/utils.py b/tests/utils.py index b7e32068c3..e9633aaa0a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -104,6 +104,15 @@ def test_is_quick(): return os.environ.get(quick_test_var, "").lower() == "true" +def is_tf32_env(): + """ + The environment variable NVIDIA_TF32_OVERRIDE=0 will override any defaults + or programmatic configuration of NVIDIA libraries, and consequently, + cuBLAS will not accelerate FP32 computations with TF32 tensor cores. + """ + return os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" + + def skip_if_quick(obj): """ Skip the unit tests if environment variable `quick_test_var=true`. 
From 22c4b535f187beb9d5afe50fcb81c632e459519d Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 09:59:30 +0100 Subject: [PATCH 08/11] fixes rtol wit tf32 Signed-off-by: Wenqi Li --- tests/test_affine_transform.py | 4 ++-- tests/test_create_grid_and_affine.py | 2 +- tests/test_lltm.py | 8 +++++--- tests/test_rand_affine.py | 6 ++++-- tests/test_rand_affine_grid.py | 6 ++++-- tests/test_rand_affined.py | 6 ++++-- tests/test_rand_elastic_2d.py | 6 ++++-- tests/test_rand_elasticd_2d.py | 6 ++++-- 8 files changed, 28 insertions(+), 16 deletions(-) diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 5c2637cec8..8e27dca076 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -98,7 +98,7 @@ def test_to_norm_affine(self, affine, src_size, dst_size, align_corners, expecte affine = torch.as_tensor(affine, device=torch.device("cuda:0"), dtype=torch.float32) new_affine = to_norm_affine(affine, src_size, dst_size, align_corners) new_affine = new_affine.detach().cpu().numpy() - np.testing.assert_allclose(new_affine, expected, atol=1e-5, rtol=1e-3) + np.testing.assert_allclose(new_affine, expected, atol=1e-5, rtol=_rtol) @parameterized.expand(TEST_ILL_TO_NORM_AFFINE_CASES) def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): @@ -146,7 +146,7 @@ def test_zoom_1(self): image = torch.arange(1.0, 13.0).view(1, 1, 3, 4).to(device=torch.device("cpu:0")) out = AffineTransform()(image, affine, (1, 4)) expected = [[[[1, 2, 3, 4]]]] - np.testing.assert_allclose(out, expected, atol=1e-3) + np.testing.assert_allclose(out, expected, atol=_rtol) def test_zoom_2(self): affine = torch.as_tensor([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0]], dtype=torch.float32) diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index eb0236452b..917f01ff96 100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -162,7 +162,7 @@ def test_assert(func, 
params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5) + assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5, atol=1e-5) class TestCreateAffine(unittest.TestCase): diff --git a/tests/test_lltm.py b/tests/test_lltm.py index b53f0c80f3..4186c91246 100644 --- a/tests/test_lltm.py +++ b/tests/test_lltm.py @@ -15,7 +15,9 @@ from parameterized import parameterized from monai.networks.layers import LLTM -from tests.utils import SkipIfNoModule +from tests.utils import SkipIfNoModule, is_tf32_env + +_rtol = 0.001 if is_tf32_env() else 0.0001 TEST_CASE_1 = [ {"input_features": 32, "state_size": 2}, @@ -50,8 +52,8 @@ def test_value_cuda(self, input_param, expected_h, expected_c): new_h, new_c = lltm(x, (h, c)) (new_h.sum() + new_c.sum()).backward() - torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=0.001, atol=0.001) - torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=0.001, atol=0.001) + torch.testing.assert_allclose(new_h, expected_h.to(device), rtol=_rtol, atol=0.001) + torch.testing.assert_allclose(new_c, expected_c.to(device), rtol=_rtol, atol=0.001) if __name__ == "__main__": diff --git a/tests/test_rand_affine.py b/tests/test_rand_affine.py index f551bb4b43..96322813c9 100644 --- a/tests/test_rand_affine.py +++ b/tests/test_rand_affine.py @@ -17,7 +17,9 @@ from monai.transforms import RandAffine from monai.utils.type_conversion import convert_data_type -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 1e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -141,7 +143,7 @@ def test_rand_affine(self, input_param, input_data, expected_val): result = g(**input_data) if input_param.get("cache_grid", False): self.assertTrue(g._cached_grid is not None) - assert_allclose(result, 
expected_val, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected_val, rtol=_rtol, atol=1e-4) def test_ill_cache(self): with self.assertWarns(UserWarning): diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 18e13f5069..07e94a5ded 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -16,7 +16,9 @@ from parameterized import parameterized from monai.transforms import RandAffineGrid -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 1e-2 if is_tf32_env else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -201,7 +203,7 @@ def test_rand_affine_grid(self, input_param, input_data, expected_val): result = g(**input_data) if "device" in input_data: self.assertEqual(result.device, input_data[device]) - assert_allclose(result, expected_val, type_test=False, rtol=1e-2, atol=1e-2) + assert_allclose(result, expected_val, type_test=False, rtol=_rtol, atol=1e-4) if __name__ == "__main__": diff --git a/tests/test_rand_affined.py b/tests/test_rand_affined.py index 179cccbd4e..651452ab07 100644 --- a/tests/test_rand_affined.py +++ b/tests/test_rand_affined.py @@ -17,7 +17,9 @@ from monai.transforms import RandAffined from monai.utils import GridSampleMode -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 1e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -209,7 +211,7 @@ def test_rand_affined(self, input_param, input_data, expected_val): if "_transforms" in key: continue expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=1e-3, atol=1e-3) + assert_allclose(result, expected, rtol=_rtol, atol=1e-3) g.set_random_state(4) res = g(input_data) diff --git a/tests/test_rand_elastic_2d.py b/tests/test_rand_elastic_2d.py index 769d87c940..22920d0f35 100644 --- 
a/tests/test_rand_elastic_2d.py +++ b/tests/test_rand_elastic_2d.py @@ -16,7 +16,9 @@ from parameterized import parameterized from monai.transforms import Rand2DElastic -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 5e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -110,7 +112,7 @@ def test_rand_2d_elastic(self, input_param, input_data, expected_val): g = Rand2DElastic(**input_param) g.set_random_state(123) result = g(**input_data) - assert_allclose(result, expected_val, rtol=5e-3, atol=5e-3) + assert_allclose(result, expected_val, rtol=_rtol, atol=1e-4) if __name__ == "__main__": diff --git a/tests/test_rand_elasticd_2d.py b/tests/test_rand_elasticd_2d.py index 62ee66608b..77e6489d50 100644 --- a/tests/test_rand_elasticd_2d.py +++ b/tests/test_rand_elasticd_2d.py @@ -16,7 +16,9 @@ from parameterized import parameterized from monai.transforms import Rand2DElasticd -from tests.utils import TEST_NDARRAYS, assert_allclose +from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env + +_rtol = 5e-3 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: @@ -164,7 +166,7 @@ def test_rand_2d_elasticd(self, input_param, input_data, expected_val): for key in res: result = res[key] expected = expected_val[key] if isinstance(expected_val, dict) else expected_val - assert_allclose(result, expected, rtol=5e-3, atol=5e-3) + assert_allclose(result, expected, rtol=_rtol, atol=5e-3) if __name__ == "__main__": From 29dd41ce7011a1e4b8842604f4c0ee1b8fa809c4 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 11:11:02 +0100 Subject: [PATCH 09/11] detect tf32 Signed-off-by: Wenqi Li --- monai/__init__.py | 2 +- tests/__init__.py | 16 ++++++++++++++++ tests/utils.py | 10 +++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/monai/__init__.py b/monai/__init__.py index 7ab30bcae7..5043208b9c 100644 --- a/monai/__init__.py +++ 
b/monai/__init__.py @@ -26,7 +26,7 @@ __basedir__ = os.path.dirname(__file__) -if not (sys.version_info.major == PY_REQUIRED_MAJOR and sys.version_info.minor >= PY_REQUIRED_MINOR): +if sys.version_info.major != PY_REQUIRED_MAJOR or sys.version_info.minor < PY_REQUIRED_MINOR: raise RuntimeError( "MONAI requires Python {}.{} or higher. But the current Python is: {}".format( PY_REQUIRED_MAJOR, PY_REQUIRED_MINOR, sys.version diff --git a/tests/__init__.py b/tests/__init__.py index 5093d1f72d..215e677d45 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -9,10 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import sys import unittest import warnings +import torch + def _enter_pr_4800(self): """ @@ -35,3 +38,16 @@ def _enter_pr_4800(self): unittest.case._AssertWarnsContext.__enter__ = _enter_pr_4800 # type: ignore except AttributeError: pass + + +_tf32_enabled: bool = False +if torch.cuda.is_available() and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0": + try: + # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result + g_gpu = torch.Generator(device="cuda") + g_gpu.manual_seed(2147483647) + a_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) + b_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) + _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.01 # 0.1713 + except BaseException: + pass diff --git a/tests/utils.py b/tests/utils.py index e9633aaa0a..664f47419a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -25,7 +25,7 @@ from io import BytesIO from subprocess import PIPE, Popen from typing import Callable, Optional, Tuple -from urllib.error import ContentTooShortError, HTTPError, URLError +from urllib.error import HTTPError, URLError import numpy as np import torch @@ -39,6 +39,7 @@ from monai.utils.misc import 
is_module_ver_at_least from monai.utils.module import version_leq from monai.utils.type_conversion import convert_data_type +from tests import _tf32_enabled nib, _ = optional_import("nibabel") @@ -94,10 +95,9 @@ def assert_allclose( def test_pretrained_networks(network, input_param, device): try: - net = network(**input_param).to(device) - except (URLError, HTTPError, ContentTooShortError) as e: + return network(**input_param).to(device) + except (URLError, HTTPError) as e: raise unittest.SkipTest(e) from e - return net def test_is_quick(): @@ -110,7 +110,7 @@ def is_tf32_env(): or programmatic configuration of NVIDIA libraries, and consequently, cuBLAS will not accelerate FP32 computations with TF32 tensor cores. """ - return os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" + return _tf32_enabled def skip_if_quick(obj): From 15a673dc876be25892fb670c0fe90ed309ebaaff Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 06:24:19 -0400 Subject: [PATCH 10/11] update flag Signed-off-by: Wenqi Li --- tests/__init__.py | 7 ++++++- tests/test_affine_transform.py | 4 ++-- tests/test_create_grid_and_affine.py | 2 +- tests/test_rand_affine_grid.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 215e677d45..9577ff6370 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -41,7 +41,11 @@ def _enter_pr_4800(self): _tf32_enabled: bool = False -if torch.cuda.is_available() and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0": +if ( + torch.cuda.is_available() + and f"{torch.version.cuda}".startswith("11") + and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" +): try: # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result g_gpu = torch.Generator(device="cuda") @@ -51,3 +55,4 @@ def _enter_pr_4800(self): _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.01 # 0.1713 except BaseException: pass +print(f"tf32 enabled: 
{_tf32_enabled}") diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 8e27dca076..ef39c297ce 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -208,7 +208,7 @@ def test_affine_transform_2d(self): ] ] ] - np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) + np.testing.assert_allclose(out, expected, atol=5e-3) def test_affine_transform_3d(self): t = np.pi / 3 @@ -258,7 +258,7 @@ def test_affine_transform_3d(self): ] ], ] - np.testing.assert_allclose(out, expected, atol=1e-4, rtol=_rtol) + np.testing.assert_allclose(out, expected, atol=5e-3) def test_ill_affine_transform(self): with self.assertRaises(ValueError): # image too small diff --git a/tests/test_create_grid_and_affine.py b/tests/test_create_grid_and_affine.py index 917f01ff96..cd8d75f63e 100644 --- a/tests/test_create_grid_and_affine.py +++ b/tests/test_create_grid_and_affine.py @@ -162,7 +162,7 @@ def test_assert(func, params, expected): m = func(*params, device="cuda:0", backend="torch") else: m = func(*params, backend=b) - assert_allclose(m, expected, type_test=False, rtol=1e-3 if is_tf32_env() else 1e-5, atol=1e-5) + assert_allclose(m, expected, type_test=False, rtol=1e-2 if is_tf32_env() else 1e-5, atol=1e-5) class TestCreateAffine(unittest.TestCase): diff --git a/tests/test_rand_affine_grid.py b/tests/test_rand_affine_grid.py index 07e94a5ded..ade615cd65 100644 --- a/tests/test_rand_affine_grid.py +++ b/tests/test_rand_affine_grid.py @@ -18,7 +18,7 @@ from monai.transforms import RandAffineGrid from tests.utils import TEST_NDARRAYS, assert_allclose, is_tf32_env -_rtol = 1e-2 if is_tf32_env else 1e-4 +_rtol = 1e-1 if is_tf32_env() else 1e-4 TESTS = [] for p in TEST_NDARRAYS: From 2ab64d3d3e87ca45ace2fde2c9617a4b4f9efce1 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 6 Oct 2021 07:18:10 -0400 Subject: [PATCH 11/11] fixes testing util Signed-off-by: Wenqi Li --- tests/__init__.py | 21 --------------------- tests/utils.py | 21 
++++++++++++++++++++- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 9577ff6370..5093d1f72d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -9,13 +9,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import sys import unittest import warnings -import torch - def _enter_pr_4800(self): """ @@ -38,21 +35,3 @@ def _enter_pr_4800(self): unittest.case._AssertWarnsContext.__enter__ = _enter_pr_4800 # type: ignore except AttributeError: pass - - -_tf32_enabled: bool = False -if ( - torch.cuda.is_available() - and f"{torch.version.cuda}".startswith("11") - and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" -): - try: - # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result - g_gpu = torch.Generator(device="cuda") - g_gpu.manual_seed(2147483647) - a_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) - b_full = torch.randn(10240, 10240, dtype=torch.double, device="cuda", generator=g_gpu) - _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.01 # 0.1713 - except BaseException: - pass -print(f"tf32 enabled: {_tf32_enabled}") diff --git a/tests/utils.py b/tests/utils.py index 664f47419a..833d5d2cc0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -39,11 +39,11 @@ from monai.utils.misc import is_module_ver_at_least from monai.utils.module import version_leq from monai.utils.type_conversion import convert_data_type -from tests import _tf32_enabled nib, _ = optional_import("nibabel") quick_test_var = "QUICKTEST" +_tf32_enabled = None def clone(data: NdarrayTensor) -> NdarrayTensor: @@ -110,6 +110,25 @@ def is_tf32_env(): or programmatic configuration of NVIDIA libraries, and consequently, cuBLAS will not accelerate FP32 computations with TF32 tensor cores. 
""" + global _tf32_enabled + if _tf32_enabled is None: + _tf32_enabled = False + if ( + torch.cuda.is_available() + and f"{torch.version.cuda}".startswith("11") + and os.environ.get("NVIDIA_TF32_OVERRIDE", "1") != "0" + and torch.cuda.device_count() > 0 + ): + try: + # with TF32 enabled, the speed is ~8x faster, but the precision has ~2 digits less in the result + g_gpu = torch.Generator(device="cuda") + g_gpu.manual_seed(2147483647) + a_full = torch.randn(1024, 1024, dtype=torch.double, device="cuda", generator=g_gpu) + b_full = torch.randn(1024, 1024, dtype=torch.double, device="cuda", generator=g_gpu) + _tf32_enabled = (a_full.float() @ b_full.float() - a_full @ b_full).abs().max().item() > 0.001 # observed ~0.17 at 10240x10240; threshold scaled down for 1024x1024 -- TODO confirm margin + except BaseException: + pass + print(f"tf32 enabled: {_tf32_enabled}") return _tf32_enabled