From 6cd01ab1b3812469fad671d26d86bfe004493eae Mon Sep 17 00:00:00 2001 From: jackalcooper Date: Thu, 24 Jun 2021 08:23:18 +0000 Subject: [PATCH 1/6] Don't build test by default --- CMakeLists.txt | 2 +- docker/package/manylinux/build_wheel.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32545e05a6b..56f3540928a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ endif() option(USE_CLANG_FORMAT "" OFF) option(BUILD_RDMA "" OFF) option(BUILD_CUDA "" ON) -option(BUILD_TESTING "" ON) +option(BUILD_TESTING "" OFF) option(WITH_XLA "Option to build with XLA" OFF) option(WITH_TENSORRT "Option to build with TensorRT" OFF) option(BUILD_GIT_VERSION "" ON) diff --git a/docker/package/manylinux/build_wheel.py b/docker/package/manylinux/build_wheel.py index 96bc68bed74..dbb544d5269 100644 --- a/docker/package/manylinux/build_wheel.py +++ b/docker/package/manylinux/build_wheel.py @@ -379,6 +379,7 @@ def build(): if cuda_version in ["11.0", "11.1"]: versioned_img_tag = f"{img_prefix}:0.2" enforced_oneflow_cmake_args = "" + enforced_oneflow_cmake_args += " -DBUILD_TESTING=ON" if float(cuda_version) >= 11: assert ( "CUDNN_STATIC" not in extra_oneflow_cmake_args From ba36e1143d8331c0a73175bbd45a4de0bf9e0dc6 Mon Sep 17 00:00:00 2001 From: tsai Date: Thu, 24 Jun 2021 17:20:27 +0800 Subject: [PATCH 2/6] refine --- .github/workflows/simple.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index 60782de5059..4432cb284b2 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -74,6 +74,7 @@ jobs: set -x cmake_flags="" cmake_flags+=" -DBUILD_CUDA=OFF" + cmake_flags+=" -DBUILD_TESTING=ON" cmake_flags+=" -G '${{ matrix.cmake_generator }}'" cmake_flags+=" -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}" cmake_flags+=" -DBUILD_SHARED_LIBS=${{ matrix.build_shared_libs }}" From f0f371a801a619c0d63a673f1bc4ac0baf4f6a06 Mon Sep 17 00:00:00 2001 From: tsai Date: Thu, 24 Jun 2021 18:04:36 +0800 Subject: [PATCH 3/6] Retry distributed run for 3 times to prevent failure --- .github/workflows/test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 58d2ee96c26..a55a0c93d25 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -283,7 +283,7 @@ jobs: "xla", "cuda_op_eager", "cpu_op_eager", - "xla_cpu" + "xla_cpu", ] include: - test_suite: "cuda" @@ -394,14 +394,14 @@ jobs: docker run $extra_docker_args \ ${image_name} \ bash -c "bash ci/test/try_install.sh && bash ci/test/build_docs.sh" - - name: Op test (distributed) + - name: Op test (distributed, will try 3 times) if: matrix.test_suite == 'cuda' run: | set -x - python3 ci/test/distributed_run.py \ - --bash_script=ci/test/2node_op_test.sh \ - --custom_img_tag=${image_name} \ - --oneflow_wheel_path=${wheelhouse_dir} + (echo "try 1" && python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${image_name} --oneflow_wheel_path=${wheelhouse_dir}) || \ + (echo "try 2" && python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${image_name} --oneflow_wheel_path=${wheelhouse_dir}) || \ + (echo "try 3" && python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${image_name} --oneflow_wheel_path=${wheelhouse_dir}) || \ + (echo "distributed test failed" && exit 1) - name: Upload log (distributed test) if: failure() && matrix.test_suite == 'cuda' uses: ./.github/actions/upload_oss From f4734b9ff5507806b3ae21babc5877b9665c7e39 Mon Sep 17 00:00:00 2001 From: tsai Date: Thu, 24 Jun 2021 18:11:47 +0800 Subject: [PATCH 4/6] refine --- .github/workflows/test.yml | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a55a0c93d25..45714eb11d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -394,14 +394,32 @@ jobs: docker run $extra_docker_args \ ${image_name} \ bash -c "bash ci/test/try_install.sh && bash ci/test/build_docs.sh" - - name: Op test (distributed, will try 3 times) + - name: Op test (distributed, 1st try) if: matrix.test_suite == 'cuda' + continue-on-error: true + id: distributed_try_1 + run: | + python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${{ env.image_name }} --oneflow_wheel_path=${{ env.wheelhouse_dir }} + - name: Op test (distributed, 2nd try) + if: matrix.test_suite == 'cuda' && steps.distributed_try_1.outcome=='failure' + continue-on-error: true + id: distributed_try_2 + run: | + python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${{ env.image_name }} --oneflow_wheel_path=${{ env.wheelhouse_dir }} + - name: Op test (distributed, 3rd try) + if: matrix.test_suite == 'cuda' && steps.distributed_try_2.outcome=='failure' + continue-on-error: true + id: distributed_try_3 + run: | + python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${{ env.image_name }} --oneflow_wheel_path=${{ env.wheelhouse_dir }} + - name: Op test (distributed, check success) + if: always() run: | - set -x - (echo "try 1" && python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${image_name} --oneflow_wheel_path=${wheelhouse_dir}) || \ - (echo "try 2" && python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${image_name} --oneflow_wheel_path=${wheelhouse_dir}) || \ - (echo "try 3" && python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${image_name} --oneflow_wheel_path=${wheelhouse_dir}) || \ - (echo "distributed test failed" && exit 1) + if ${{ steps.distributed_try_1.outcome=='success' || steps.distributed_try_2.outcome=='success' || steps.distributed_try_3.outcome=='success' }}; then + echo success + else + exit 1 + fi - name: Upload log (distributed test) if: failure() && matrix.test_suite == 'cuda' uses: ./.github/actions/upload_oss From 45de1b1cf485d32391402b220d26c2db49cd1e00 Mon Sep 17 00:00:00 2001 From: tsai Date: Thu, 24 Jun 2021 18:14:09 +0800 Subject: [PATCH 5/6] revert changes unwanted --- .github/workflows/simple.yml | 1 - CMakeLists.txt | 2 +- docker/package/manylinux/build_wheel.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index 4432cb284b2..60782de5059 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -74,7 +74,6 @@ jobs: set -x cmake_flags="" cmake_flags+=" -DBUILD_CUDA=OFF" - cmake_flags+=" -DBUILD_TESTING=ON" cmake_flags+=" -G '${{ matrix.cmake_generator }}'" cmake_flags+=" -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}" cmake_flags+=" -DBUILD_SHARED_LIBS=${{ matrix.build_shared_libs }}" diff --git a/CMakeLists.txt b/CMakeLists.txt index 56f3540928a..32545e05a6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ endif() option(USE_CLANG_FORMAT "" OFF) option(BUILD_RDMA "" OFF) option(BUILD_CUDA "" ON) -option(BUILD_TESTING "" OFF) +option(BUILD_TESTING "" ON) option(WITH_XLA "Option to build with XLA" OFF) option(WITH_TENSORRT "Option to build with TensorRT" OFF) option(BUILD_GIT_VERSION "" ON) diff --git a/docker/package/manylinux/build_wheel.py b/docker/package/manylinux/build_wheel.py index dbb544d5269..96bc68bed74 100644 --- a/docker/package/manylinux/build_wheel.py +++ b/docker/package/manylinux/build_wheel.py @@ -379,7 +379,6 @@ def build(): if cuda_version in ["11.0", "11.1"]: versioned_img_tag = f"{img_prefix}:0.2" enforced_oneflow_cmake_args = "" - enforced_oneflow_cmake_args += " -DBUILD_TESTING=ON" if float(cuda_version) >= 11: assert ( "CUDNN_STATIC" not in extra_oneflow_cmake_args From 4e609f97b631f011bdb2768430561e3982352418 Mon Sep 17 00:00:00 2001 From: tsai Date: Thu, 24 Jun 2021 19:01:52 +0800 Subject: [PATCH 6/6] address review --- .github/workflows/test.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 45714eb11d0..feef1a656a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -408,18 +408,10 @@ jobs: python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${{ env.image_name }} --oneflow_wheel_path=${{ env.wheelhouse_dir }} - name: Op test (distributed, 3rd try) if: matrix.test_suite == 'cuda' && steps.distributed_try_2.outcome=='failure' - continue-on-error: true + continue-on-error: false id: distributed_try_3 run: | python3 ci/test/distributed_run.py --bash_script=ci/test/2node_op_test.sh --custom_img_tag=${{ env.image_name }} --oneflow_wheel_path=${{ env.wheelhouse_dir }} - - name: Op test (distributed, check success) - if: always() - run: | - if ${{ steps.distributed_try_1.outcome=='success' || steps.distributed_try_2.outcome=='success' || steps.distributed_try_3.outcome=='success' }}; then - echo success - else - exit 1 - fi - name: Upload log (distributed test) if: failure() && matrix.test_suite == 'cuda' uses: ./.github/actions/upload_oss