diff --git a/ci/core.sh b/ci/core.sh
index 58c3991f8..1d898e28c 100755
--- a/ci/core.sh
+++ b/ci/core.sh
@@ -24,9 +24,12 @@ if [ $rc -ne 0 ]; then
     exit $rc
 fi
 
-echo ===== Run non GEMM tests =====
-ctest --test-dir build -j4 --output-on-failure -E "OperatorTest/GEMMTestSuite"
-test $? -eq 0 || test_run_error "non-GEMM"
+check_test_filter "nongemm"
+if [ $? -eq 0 ]; then
+    echo ===== Run non GEMM tests =====
+    ctest --test-dir build -j4 --output-on-failure -E "OperatorTest/GEMMTestSuite"
+    test $? -eq 0 || test_run_error "non-GEMM"
+fi
 
 for _gemm in hipblaslt rocblas; do
     configure_gemm_env $_gemm || continue
@@ -34,9 +37,12 @@ for _gemm in hipblaslt rocblas; do
     if [ $_gemm = "hipblaslt" ]; then
         _exclude="-E Test(.*bf16/.*X.X1|.*fp8.*fp16/.*X1X0|.*fp8.*X.X1|.*fp8/|.*bf8/)"
     fi
-    echo ===== Run GEMM $_gemm tests =====
-    ctest --test-dir build -j4 --output-on-failure -R "OperatorTest/GEMMTestSuite" $_exclude
-    test $? -eq 0 || test_run_error "GEMM $_gemm"
+    check_test_filter $_gemm
+    if [ $? -eq 0 ]; then
+        echo ===== Run GEMM $_gemm tests =====
+        ctest --test-dir build -j4 --output-on-failure -R "OperatorTest/GEMMTestSuite" $_exclude
+        test $? -eq 0 || test_run_error "GEMM $_gemm"
+    fi
 done
 
 return_run_results
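Note on the ci/core.sh hunks above: check_test_filter is a helper defined elsewhere in the CI scripts and is not part of this diff. The bash sketch below is a hypothetical illustration of the gating convention those hunks rely on, where a return status of 0 means "run this test group"; the TEST_FILTER variable name and its space-separated list semantics are assumptions, not the actual implementation.

# Hypothetical sketch only: check_test_filter is defined elsewhere in the CI
# scripts. This illustrates the convention used above (exit status 0 == run
# the group). TEST_FILTER and its space-separated semantics are assumptions.
check_test_filter() {
    local group="$1"
    # No filter configured: run every test group.
    if [ -z "${TEST_FILTER:-}" ]; then
        return 0
    fi
    # Otherwise run the group only if it appears in the filter list.
    case " ${TEST_FILTER} " in
        *" ${group} "*) return 0 ;;
        *) return 1 ;;
    esac
}

# Usage mirroring ci/core.sh:
#   check_test_filter "nongemm" && echo "run non-GEMM tests"
#   TEST_FILTER="hipblaslt" check_test_filter "rocblas" || echo "rocblas group skipped"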
diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu
index dab4a77a6..fa4bdbd47 100644
--- a/tests/cpp/operator/test_cublaslt_gemm.cu
+++ b/tests/cpp/operator/test_cublaslt_gemm.cu
@@ -155,7 +155,13 @@ void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, c
   }
 #endif
 
-  Tensor Workspace({ 33554432 }, DType::kByte);
+  size_t workspace_size = 33554432;
+#ifdef __HIP_PLATFORM_AMD__
+  if (prop.major == 9 && prop.minor == 5) {
+    workspace_size = 67108864;
+  }
+#endif
+  Tensor Workspace({ workspace_size }, DType::kByte);
 
   //perform the gemm in GPU
   nvte_cublas_gemm(A.data(),
@@ -212,6 +218,18 @@ void performTest(bool use_bias, bool use_gelu, const size_t m, const size_t k, c
   if (dtype == DType::kFloat32) {
     atol = 1e-5;
   }
+#ifdef __HIP_PLATFORM_AMD__
+  if (prop.major == 9 && prop.minor == 5)
+  {
+    // relax tolerances for certain GEMMs with hipblaslt
+    if (!isFp8Type(dtype) && (isFp8Type(atype) or isFp8Type(btype))) {
+      atol = 5e-4;
+      rtol = 5e-3;
+    } else if (dtype == DType::kFloat32) {
+      rtol = 1e-5;
+    }
+  }
+#endif
   compareResults("D", D, ref_D.get(), atol, rtol);
 
   if(use_gelu){
diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index afa63fc9b..d0398567e 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -1972,8 +1972,9 @@ def test_transformer_layer_hidden_states_format(dtype, bs, model):
     if IS_HIP_EXTENSION:
         if use_hipblaslt():
             tols = dtype_tols(dtype)
-            if dtype in (torch.float16, torch.bfloat16) and is_mi308():
-                # mi308 hipblaslt precision issue
+            if dtype in (torch.float16, torch.bfloat16):
+                # On some GPUs, hipblaslt results for SBHD and BSHD differ,
+                # which lowers the precision of the final result
                 tols["atol"] = 2e-3
         _, use_aotriton, use_ck = rocm_attn_backend()
         if use_aotriton and not use_ck:
diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt
index 5fb765584..20ffbe611 100644
--- a/transformer_engine/common/CMakeLists.txt
+++ b/transformer_engine/common/CMakeLists.txt
@@ -255,7 +255,7 @@ else()
   set(__AOTRITON_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/aotriton")
   set(__AOTRITON_SUFFIX "_TEprivate")
   if(NOT DEFINED AOTRITON_PATH)
-    # # Install aotriton fused attn
+    # Install aotriton fused attn
     if(USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS)
       set(AOTRITON_NOIMAGE_MODE OFF)
     else()
@@ -271,51 +271,60 @@ else()
       foreach(X IN LISTS CMAKE_HIP_ARCHITECTURES)
         set(key ${X})
         string(APPEND key "_key")
-        string(APPEND aotriton_target_gpus ${${key}})
-        string(APPEND aotriton_target_gpus "|")
+        set(gpu ${${key}})
+        if (gpu)
+          string(APPEND aotriton_target_gpus "${gpu}|")
+        else()
+          message(WARNING "AOTriton building is not supported for ${X}")
+        endif()
       endforeach()
     endmacro()
     translate_arch_to_gpu_names(aotriton_target_gpus)
-    include(ExternalProject)
-    ExternalProject_Add(aotriton_external
-      SOURCE_DIR ../../3rdparty/aotriton
-      LIST_SEPARATOR |
-      CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${__AOTRITON_INSTALL_DIR}
-                 -DTARGET_GPUS=${aotriton_target_gpus}
-                 -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-                 -DAOTRITON_NO_PYTHON=ON
-                 -DAOTRITON_NAME_SUFFIX=${__AOTRITON_SUFFIX}
-                 -DAOTRITON_NOIMAGE_MODE=${AOTRITON_NOIMAGE_MODE}
-      BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton${__AOTRITON_SUFFIX}_v2.so"
-    )
-    add_library(aotriton INTERFACE)
-    add_dependencies(aotriton aotriton_external)
-    target_link_libraries(aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/lib/libaotriton${__AOTRITON_SUFFIX}_v2.so)
-    target_include_directories(aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
-    if(NOT USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS)
-      set(__AOTRITON_VER "0.8.2b")
-      set(__AOTRITON_SHA256 "66445e6b0209b9f4080743b839cc9d424054dc5c8d07363f9f27f109231c324a")
-      string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/"
-                                   "${__AOTRITON_VER}/aotriton-"
-                                   "${__AOTRITON_VER}-manylinux_2_28"
-                                   "_x86_64-rocm6.2"
-                                   "-shared.tar.gz")
-      ExternalProject_Add(aotriton_images
-        URL "${__AOTRITON_URL}"
-        URL_HASH SHA256=${__AOTRITON_SHA256}
-        SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_tarball
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND ""
-        INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
-          "${CMAKE_CURRENT_BINARY_DIR}/aotriton_tarball/lib/aotriton.images"
-          "${__AOTRITON_INSTALL_DIR}/lib/aotriton.images")
-      add_dependencies(aotriton aotriton_images)
+    if (NOT aotriton_target_gpus)
+      set(USE_FUSED_ATTN_AOTRITON FALSE)
+      message(WARNING "Disable AOTriton building because no supported GPU targets found")
+    else()
+      include(ExternalProject)
+      ExternalProject_Add(aotriton_external
+        SOURCE_DIR ../../3rdparty/aotriton
+        LIST_SEPARATOR |
+        CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${__AOTRITON_INSTALL_DIR}
+                   -DTARGET_GPUS=${aotriton_target_gpus}
+                   -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+                   -DAOTRITON_NO_PYTHON=ON
+                   -DAOTRITON_NAME_SUFFIX=${__AOTRITON_SUFFIX}
+                   -DAOTRITON_NOIMAGE_MODE=${AOTRITON_NOIMAGE_MODE}
+        BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton${__AOTRITON_SUFFIX}_v2.so"
+      )
+      add_library(aotriton INTERFACE)
+      add_dependencies(aotriton aotriton_external)
+      target_link_libraries(aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/lib/libaotriton${__AOTRITON_SUFFIX}_v2.so)
+      target_include_directories(aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
+      if(NOT USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS)
+        set(__AOTRITON_VER "0.8.2b")
+        set(__AOTRITON_SHA256 "66445e6b0209b9f4080743b839cc9d424054dc5c8d07363f9f27f109231c324a")
+        string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/"
+                                     "${__AOTRITON_VER}/aotriton-"
+                                     "${__AOTRITON_VER}-manylinux_2_28"
+                                     "_x86_64-rocm6.2"
+                                     "-shared.tar.gz")
+        ExternalProject_Add(aotriton_images
+          URL "${__AOTRITON_URL}"
+          URL_HASH SHA256=${__AOTRITON_SHA256}
+          SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_tarball
+          CONFIGURE_COMMAND ""
+          BUILD_COMMAND ""
+          INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
+            "${CMAKE_CURRENT_BINARY_DIR}/aotriton_tarball/lib/aotriton.images"
+            "${__AOTRITON_INSTALL_DIR}/lib/aotriton.images")
+        add_dependencies(aotriton aotriton_images)
+      endif()
+      install(DIRECTORY
+        ${__AOTRITON_INSTALL_DIR}/lib
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/transformer_engine
+        PATTERN "cmake" EXCLUDE
+        PATTERN "libaotriton${__AOTRITON_SUFFIX}_v2.so" EXCLUDE)
     endif()
-    install(DIRECTORY
-      ${__AOTRITON_INSTALL_DIR}/lib
-      DESTINATION ${CMAKE_INSTALL_PREFIX}/transformer_engine
-      PATTERN "cmake" EXCLUDE
-      PATTERN "libaotriton${__AOTRITON_SUFFIX}_v2.so" EXCLUDE)
   else()
     # Use aotriton built during initial TE building/installation
     # When only need rebuild TE library itself
diff --git a/transformer_engine/common/amd_detail/hip_float8.h b/transformer_engine/common/amd_detail/hip_float8.h
index f135945ed..66df949dc 100644
--- a/transformer_engine/common/amd_detail/hip_float8.h
+++ b/transformer_engine/common/amd_detail/hip_float8.h
@@ -88,7 +88,7 @@ struct te_hip_fp8_e4m3 {
 
   __host__ __device__ operator float() const { return data.operator float(); }
 
-  __host__ __device__ te_hip_fp8_e4m3(const float& v) { data = v;}
+  __host__ __device__ te_hip_fp8_e4m3(const float& v): data(v) {}
 };
 
 static_assert(sizeof(te_hip_fp8_e4m3) == 1, "Size mismatch");
diff --git a/transformer_engine/common/recipe/delayed_scaling.cu b/transformer_engine/common/recipe/delayed_scaling.cu
index a86cc4235..9e79c8ed1 100644
--- a/transformer_engine/common/recipe/delayed_scaling.cu
+++ b/transformer_engine/common/recipe/delayed_scaling.cu
@@ -36,8 +36,10 @@ inline float fp8_dtype_max(DType dtype) {
     case DType::kFloat8E4M3:
 #ifndef __HIP_PLATFORM_AMD__
       return 448;
-#else
+#elif HIP_VERSION >= 60300000
       return te_fp8_fnuz() ? 240 : 448;
+#else
+      return 240; // older HIP versions only support the fnuz E4M3 format (max 240)
 #endif
     case DType::kFloat8E5M2:
       return 57344;
diff --git a/transformer_engine/common/util/system.cpp b/transformer_engine/common/util/system.cpp
index 263b301b7..fcc1ff531 100644
--- a/transformer_engine/common/util/system.cpp
+++ b/transformer_engine/common/util/system.cpp
@@ -91,6 +91,6 @@ extern "C" bool nvte_uses_fp8_fnuz()
 #if HIP_VERSION >= 60300000
   return te_fp8_fnuz();
 #endif
-  return true; // default to true for older versions that only support
+  return true; // default to true for compatibility with older versions
 }
 #endif
diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py
index 4fa76b74d..dd9efa2ff 100644
--- a/transformer_engine/pytorch/attention.py
+++ b/transformer_engine/pytorch/attention.py
@@ -120,7 +120,7 @@ def _get_supported_versions(version_min, version_max):
 _flash_attn_is_installed = False
 _flash_attn_version = PkgVersion("0")
 _flash_attn_version_required = PkgVersion("2.1.1")
-_flash_attn_max_version = PkgVersion("2.7.3")
+_flash_attn_max_version = PkgVersion("2.7.4.post1")
 _flash_attn_2_plus = False
 _flash_attn_2_1_plus = False
 _flash_attn_2_3_plus = False