From 1f8ff70030ffd82e811917e2946773f9a589606b Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Mon, 31 Jan 2022 17:03:12 -0600 Subject: [PATCH 01/13] add docker file and make default target buildable --- Dockerfile | 90 +++++++++++++++++++++++++++++++++++++++++++++ host/CMakeLists.txt | 2 +- requirements.txt | 1 + 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 Dockerfile create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000000..597040b7a0c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,90 @@ +FROM ubuntu:18.04 + +ARG ROCMVERSION=4.5 +ARG OSDB_BKC_VERSION + +RUN set -xe + +ARG BUILD_THREADS=8 +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +# Add rocm repository +RUN apt-get update +RUN apt-get install -y wget gnupg +RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +RUN if ! [ -z $OSDB_BKC_VERSION ]; then \ + echo "Using BKC VERISION: $OSDB_BKC_VERSION";\ + sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\ + cat /etc/apt/sources.list.d/rocm.list;\ + else \ + sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\ + fi +RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - +RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" + +# ADD requirements.txt requirements.txt +# Install dependencies +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + sshpass \ + build-essential \ + cmake-data=3.15.1-0kitware1 \ + cmake=3.15.1-0kitware1 \ + curl \ + doxygen \ + g++ \ + gdb \ + git \ + hip-rocclr \ + jq \ + lcov \ + libelf-dev \ + libncurses5-dev \ + libnuma-dev \ + libpthread-stubs0-dev \ + llvm-amdgpu \ + miopengemm \ + pkg-config \ + python \ + python3 \ + python-dev \ + python3-dev \ + python-pip \ + python3-pip \ + software-properties-common \ + sqlite3 \ + wget \ + rocm-dev \ + rocm-device-libs \ + rocm-opencl \ + rocm-opencl-dev \ + rocm-cmake \ + rocblas \ + vim \ + zlib1g-dev \ + openssh-server \ + kmod \ + mysql-client && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# RUN pip3 install --default-timeout=100000 -r requirements.txt + +# Setup ubsan environment to printstacktrace +RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer +ENV UBSAN_OPTIONS=print_stacktrace=1 + +# Install an init system +RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb +RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb + +# Install cget +RUN pip install cget + +# Install rclone +RUN pip install https://github.com/pfultz2/rclone/archive/master.tar.gz + +ARG PREFIX=/opt/rocm +# Install dependencies +RUN cget install pfultz2/rocm-recipes +# RUN cget install -f min-requirements.txt +# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt index 30cc14d8caf..e94ad6ba223 100644 --- a/host/CMakeLists.txt +++ b/host/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(host_tensor) -add_subdirectory(driver_offline) +# add_subdirectory(driver_offline) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000000..0d93a7b69ee --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969 --build From b824a5a62b29d06b22372f4692b896dab3148f9a Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 3 Feb 2022 16:50:39 -0600 Subject: [PATCH 02/13] add Jenkinsfile --- Jenkinsfile | 336 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 337 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 00000000000..44903d49a78 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,336 @@ +def rocmnode(name) { + return 'rocmtest && miopen && ' + name +} + +def show_node_info() { + sh """ + echo "NODE_NAME = \$NODE_NAME" + lsb_release -sd + uname -r + cat /sys/module/amdgpu/version + ls /opt/ -la + """ +} + +def cmake_build(Map conf=[:]){ + + def compiler = conf.get("compiler","/opt/rocm/llvm/bin/clang++") + def config_targets = conf.get("config_targets","check") + def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") + def build_envs = "CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 " + conf.get("build_env","") + def prefixpath = conf.get("prefixpath","/usr/local") + def mlir_args = " -DMIOPEN_USE_MLIR=" + conf.get("mlir_build", "ON") + def setup_args = mlir_args + " -DMIOPEN_GPU_SYNC=Off " + conf.get("setup_flags","") + + if (prefixpath != "/usr/local"){ + setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " + } + + def build_type_debug = (conf.get("build_type",'release') == 'debug') + + //cmake_env can overwrite default CXX variables. + def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") + + def package_build = (conf.get("package_build","") == "true") + + if (package_build == true) { + config_targets = "package" + } + + if(conf.get("build_install","") == "true") + { + config_targets = 'install ' + config_targets + setup_args = ' -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install' + setup_args + } else{ + setup_args = ' -DBUILD_DEV=On' + setup_args + } + + // test_flags = ctest -> MIopen flags + def test_flags = conf.get("test_flags","") + + if (conf.get("vcache_enable","") == "true"){ + def vcache = conf.get(vcache_path,"/var/jenkins/.cache/miopen/vcache") + build_envs = " MIOPEN_VERIFY_CACHE_PATH='${vcache}' " + build_envs + } else{ + test_flags = " --disable-verification-cache " + test_flags + } + + if(conf.get("codecov", false)){ //Need + setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags} -fprofile-arcs -ftest-coverage' -DCODECOV_TEST=On " + setup_args + }else if(build_type_debug){ + setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args + }else{ + setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args + } + + if(test_flags != ""){ + setup_args = "-DMIOPEN_TEST_FLAGS='${test_flags}'" + setup_args + } + + if(conf.containsKey("find_mode")) + { + def fmode = conf.get("find_mode", "") + setup_args = " -DMIOPEN_DEFAULT_FIND_MODE=${fmode} " + setup_args + } + + def pre_setup_cmd = """ + echo \$HSA_ENABLE_SDMA + ulimit -c unlimited + rm -rf build + mkdir build + rm -rf install + mkdir install + rm -f src/kernels/*.ufdb.txt + rm -f src/kernels/miopen*.udb + cd build + """ + def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") + def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(nproc) ${config_targets}") + def execute_cmd = conf.get("execute_cmd", "") + + def cmd = conf.get("cmd", """ + ${pre_setup_cmd} + ${setup_cmd} + ${build_cmd} + ${execute_cmd} + """) + + echo cmd + sh cmd + + // Only archive from master or develop + if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) { + archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true + } +} + +def buildHipClangJob(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = "composable_kernels" + def prefixpath = conf.get("prefixpath", "/opt/rocm") + def gpu_arch = conf.get("gpu_arch", "gfx908") + + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1" + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' " + + def variant = env.STAGE_NAME + + + def retimage + gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + retimage = docker.build("${image}", dockerArgs + '.') + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES') + { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + "--no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES') + { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + } + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 5, unit: 'HOURS') + { + cmake_build(conf) + } + } + } + return retimage +} + +def reboot(){ + build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] +} + +def buildHipClangJobAndReboot(Map conf=[:]){ + try{ + buildHipClangJob(conf) + } + catch(e){ + echo "throwing error exception for the stage" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +pipeline { + agent none + options { + parallelsAlwaysFailFast() + } + environment{ + // variable = value + } + stages{ + stage("Static checks") { + parallel{ + stage('Tidy') { + agent{ label rocmnode("nogpu") } + environment{ + setup_cmd = "CXX='/opt/rocm/llvm/bin/clang++' cmake -DMIOPEN_BACKEND=HIP -DBUILD_DEV=On .. " + build_cmd = "make -j\$(nproc) -k analyze" + } + steps{ + buildHipClangJobAndReboot(setup_cmd: setup_cmd, build_cmd: build_cmd, no_reboot:true) + } + } + stage('Clang Format') { + agent{ label rocmnode("nogpu") } + environment{ + execute_cmd = "find . -iname \'*.h\' \ + -o -iname \'*.hpp\' \ + -o -iname \'*.cpp\' \ + -o -iname \'*.h.in\' \ + -o -iname \'*.hpp.in\' \ + -o -iname \'*.cpp.in\' \ + -o -iname \'*.cl\' \ + | grep -v 'build/' \ + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-10 -style=file {} | diff - {}\'" + } + steps{ + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) + } + } + } + } + // stage("Full Tests") { + // when { + // expression { params.BUILD_FULL_TESTS } + // } + // environment{ + // WORKAROUND_iGemm_936 = " MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1=0" + // // WORKAROUND_ISSUE_1148: + // Navi21_build_cmd = "CTEST_PARALLEL_LEVEL=2 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check" + // } + // parallel{ + // stage('Fp32 Hip All gfx908') { + // when { + // beforeAgent true + // expression { params.TARGET_GFX908 && params.DATATYPE_FP32 } + // } + // agent{ label rocmnode("gfx908") } + // steps{ + // buildHipClangJobAndReboot(setup_flags: Full_test, gpu_arch: "gfx908") + // } + // } + // stage('Fp32 Hip All gfx90a') { + // when { + // beforeAgent true + // expression { params.TARGET_GFX90A && params.DATATYPE_FP32 } + // } + // agent{ label rocmnode("gfx90a") } + // steps{ + // buildHipClangJobAndReboot(setup_flags: Full_test, gpu_arch: "gfx90a:xnack-") + // } + // } + // stage('Fp32 Hip All gfx90a Xnack+') { + // when { + // beforeAgent true + // expression { params.TARGET_GFX90A && params.DATATYPE_FP32 } + // } + // agent{ label rocmnode("gfx90a") } + // steps{ + // buildHipClangJobAndReboot(setup_flags: Full_test, gpu_arch: "gfx90a:xnack+", enforce_xnack_on: true) + // } + // } + // stage('Fp16 Hip Install All Vega20') { + // when { + // beforeAgent true + // expression { params.TARGET_VEGA20 && params.DATATYPE_FP16 } + // } + // agent{ label rocmnode("vega20") } + // steps{ + // buildHipClangJobAndReboot( setup_flags: Full_test + Fp16_flags, build_env: WORKAROUND_iGemm_936, build_install: "true") + // } + // } + // stage('Fp32 Hip All Vega20') { + // when { + // beforeAgent true + // expression { params.TARGET_VEGA20 && params.DATATYPE_FP32 } + // } + // agent{ label rocmnode("vega20") } + // steps{ + // buildHipClangJobAndReboot( setup_flags: Full_test) + // } + // } + // stage('Fp32 OpenCL All gfx1030') { + // when { + // beforeAgent true + // expression { params.TARGET_NAVI21 && params.DATATYPE_FP32 } + // } + // agent{ label rocmnode("navi21") } + // steps{ + // buildHipClangJobAndReboot(compiler: 'g++', setup_flags: Full_test, build_cmd: Navi21_build_cmd, gpu_arch: "gfx1030") + // } + // } + // stage('Fp32 Hip All Install gfx1030') { + // when { + // beforeAgent true + // expression { params.TARGET_NAVI21 && params.DATATYPE_FP32 } + // } + // agent{ label rocmnode("navi21") } + // steps{ + // buildHipClangJobAndReboot(setup_flags: Full_test, build_cmd: Navi21_build_cmd, build_install: "true", gpu_arch: "gfx1030") + // } + // } + // stage('Fp16 Hip All Install gfx908') { + // when { + // beforeAgent true + // expression { params.TARGET_GFX908 && params.DATATYPE_FP16 } + // } + // agent{ label rocmnode("gfx908") } + // steps{ + // buildHipClangJobAndReboot(setup_flags: Full_test + Fp16_flags, build_env: WORKAROUND_iGemm_936, build_install: "true", gpu_arch: "gfx908") + // } + // } + // stage('Fp16 Hip All Install gfx90a') { + // when { + // beforeAgent true + // expression { params.TARGET_GFX90A && params.DATATYPE_FP16 } + // } + // agent{ label rocmnode("gfx90a") } + // steps{ + // buildHipClangJobAndReboot(setup_flags: Full_test + Fp16_flags, build_env: WORKAROUND_iGemm_936, build_install: "true", gpu_arch: "gfx90a:xnack-") + // } + // } + // } + // } + + stage("Packages") { + when { + expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } + } + stage("HIP Package /opt/rocm") { + agent{ label rocmnode("nogpu") } + steps{ + buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") + } + } + } + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0d93a7b69ee..afc833cfcf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969 --build +danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f From 91d41dec4b61447990d18bc8fb154cb7a8099d48 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 3 Feb 2022 17:00:44 -0600 Subject: [PATCH 03/13] remove empty env block --- Jenkinsfile | 111 ++-------------------------------------------------- 1 file changed, 4 insertions(+), 107 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 44903d49a78..9056c10c81f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -183,9 +183,9 @@ pipeline { options { parallelsAlwaysFailFast() } - environment{ - // variable = value - } + // environment{ + // variable = value + // } stages{ stage("Static checks") { parallel{ @@ -218,114 +218,11 @@ pipeline { } } } - // stage("Full Tests") { - // when { - // expression { params.BUILD_FULL_TESTS } - // } - // environment{ - // WORKAROUND_iGemm_936 = " MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1=0" - // // WORKAROUND_ISSUE_1148: - // Navi21_build_cmd = "CTEST_PARALLEL_LEVEL=2 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check" - // } - // parallel{ - // stage('Fp32 Hip All gfx908') { - // when { - // beforeAgent true - // expression { params.TARGET_GFX908 && params.DATATYPE_FP32 } - // } - // agent{ label rocmnode("gfx908") } - // steps{ - // buildHipClangJobAndReboot(setup_flags: Full_test, gpu_arch: "gfx908") - // } - // } - // stage('Fp32 Hip All gfx90a') { - // when { - // beforeAgent true - // expression { params.TARGET_GFX90A && params.DATATYPE_FP32 } - // } - // agent{ label rocmnode("gfx90a") } - // steps{ - // buildHipClangJobAndReboot(setup_flags: Full_test, gpu_arch: "gfx90a:xnack-") - // } - // } - // stage('Fp32 Hip All gfx90a Xnack+') { - // when { - // beforeAgent true - // expression { params.TARGET_GFX90A && params.DATATYPE_FP32 } - // } - // agent{ label rocmnode("gfx90a") } - // steps{ - // buildHipClangJobAndReboot(setup_flags: Full_test, gpu_arch: "gfx90a:xnack+", enforce_xnack_on: true) - // } - // } - // stage('Fp16 Hip Install All Vega20') { - // when { - // beforeAgent true - // expression { params.TARGET_VEGA20 && params.DATATYPE_FP16 } - // } - // agent{ label rocmnode("vega20") } - // steps{ - // buildHipClangJobAndReboot( setup_flags: Full_test + Fp16_flags, build_env: WORKAROUND_iGemm_936, build_install: "true") - // } - // } - // stage('Fp32 Hip All Vega20') { - // when { - // beforeAgent true - // expression { params.TARGET_VEGA20 && params.DATATYPE_FP32 } - // } - // agent{ label rocmnode("vega20") } - // steps{ - // buildHipClangJobAndReboot( setup_flags: Full_test) - // } - // } - // stage('Fp32 OpenCL All gfx1030') { - // when { - // beforeAgent true - // expression { params.TARGET_NAVI21 && params.DATATYPE_FP32 } - // } - // agent{ label rocmnode("navi21") } - // steps{ - // buildHipClangJobAndReboot(compiler: 'g++', setup_flags: Full_test, build_cmd: Navi21_build_cmd, gpu_arch: "gfx1030") - // } - // } - // stage('Fp32 Hip All Install gfx1030') { - // when { - // beforeAgent true - // expression { params.TARGET_NAVI21 && params.DATATYPE_FP32 } - // } - // agent{ label rocmnode("navi21") } - // steps{ - // buildHipClangJobAndReboot(setup_flags: Full_test, build_cmd: Navi21_build_cmd, build_install: "true", gpu_arch: "gfx1030") - // } - // } - // stage('Fp16 Hip All Install gfx908') { - // when { - // beforeAgent true - // expression { params.TARGET_GFX908 && params.DATATYPE_FP16 } - // } - // agent{ label rocmnode("gfx908") } - // steps{ - // buildHipClangJobAndReboot(setup_flags: Full_test + Fp16_flags, build_env: WORKAROUND_iGemm_936, build_install: "true", gpu_arch: "gfx908") - // } - // } - // stage('Fp16 Hip All Install gfx90a') { - // when { - // beforeAgent true - // expression { params.TARGET_GFX90A && params.DATATYPE_FP16 } - // } - // agent{ label rocmnode("gfx90a") } - // steps{ - // buildHipClangJobAndReboot(setup_flags: Full_test + Fp16_flags, build_env: WORKAROUND_iGemm_936, build_install: "true", gpu_arch: "gfx90a:xnack-") - // } - // } - // } - // } - stage("Packages") { when { expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } } - stage("HIP Package /opt/rocm") { + stage("Package /opt/rocm") { agent{ label rocmnode("nogpu") } steps{ buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") From 1734cb6735d03658e3c2bcf06b31a17e11ec1a54 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 3 Feb 2022 17:04:52 -0600 Subject: [PATCH 04/13] fix package stage --- Jenkinsfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9056c10c81f..42fc1389c3f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -222,10 +222,12 @@ pipeline { when { expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } } - stage("Package /opt/rocm") { - agent{ label rocmnode("nogpu") } - steps{ - buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") + parallel { + stage("Package /opt/rocm") { + agent{ label rocmnode("nogpu") } + steps{ + buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") + } } } } From 211708bf89f3f57af4851863ff7c128beb6c8015 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 3 Feb 2022 17:21:08 -0600 Subject: [PATCH 05/13] remove render group from docker run --- Jenkinsfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 42fc1389c3f..844057cecb2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -114,7 +114,9 @@ def buildHipClangJob(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") def gpu_arch = conf.get("gpu_arch", "gfx908") - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + // Jenkins is complaining about the render group + // def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1" } From a9eabe4b10384b2f299c797a652442ac01193c02 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Sun, 6 Feb 2022 16:29:51 -0600 Subject: [PATCH 06/13] clean up Jenkins file --- Jenkinsfile | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 844057cecb2..3068223e421 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,13 +14,12 @@ def show_node_info() { def cmake_build(Map conf=[:]){ - def compiler = conf.get("compiler","/opt/rocm/llvm/bin/clang++") + def compiler = conf.get("compiler","/opt/rocm/bin/hipcc") def config_targets = conf.get("config_targets","check") def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") def build_envs = "CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 " + conf.get("build_env","") - def prefixpath = conf.get("prefixpath","/usr/local") - def mlir_args = " -DMIOPEN_USE_MLIR=" + conf.get("mlir_build", "ON") - def setup_args = mlir_args + " -DMIOPEN_GPU_SYNC=Off " + conf.get("setup_flags","") + def prefixpath = conf.get("prefixpath","/opt/rocm") + def setup_args = " -DMIOPEN_GPU_SYNC=Off " + conf.get("setup_flags","") if (prefixpath != "/usr/local"){ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " @@ -45,34 +44,12 @@ def cmake_build(Map conf=[:]){ setup_args = ' -DBUILD_DEV=On' + setup_args } - // test_flags = ctest -> MIopen flags - def test_flags = conf.get("test_flags","") - - if (conf.get("vcache_enable","") == "true"){ - def vcache = conf.get(vcache_path,"/var/jenkins/.cache/miopen/vcache") - build_envs = " MIOPEN_VERIFY_CACHE_PATH='${vcache}' " + build_envs - } else{ - test_flags = " --disable-verification-cache " + test_flags - } - - if(conf.get("codecov", false)){ //Need - setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags} -fprofile-arcs -ftest-coverage' -DCODECOV_TEST=On " + setup_args - }else if(build_type_debug){ + if(build_type_debug){ setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args }else{ setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args } - if(test_flags != ""){ - setup_args = "-DMIOPEN_TEST_FLAGS='${test_flags}'" + setup_args - } - - if(conf.containsKey("find_mode")) - { - def fmode = conf.get("find_mode", "") - setup_args = " -DMIOPEN_DEFAULT_FIND_MODE=${fmode} " + setup_args - } - def pre_setup_cmd = """ echo \$HSA_ENABLE_SDMA ulimit -c unlimited @@ -80,8 +57,6 @@ def cmake_build(Map conf=[:]){ mkdir build rm -rf install mkdir install - rm -f src/kernels/*.ufdb.txt - rm -f src/kernels/miopen*.udb cd build """ def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") @@ -194,11 +169,11 @@ pipeline { stage('Tidy') { agent{ label rocmnode("nogpu") } environment{ - setup_cmd = "CXX='/opt/rocm/llvm/bin/clang++' cmake -DMIOPEN_BACKEND=HIP -DBUILD_DEV=On .. " + // setup_cmd = "CXX='/opt/rocm/bin/hipcc' cmake -DBUILD_DEV=On .. " build_cmd = "make -j\$(nproc) -k analyze" } steps{ - buildHipClangJobAndReboot(setup_cmd: setup_cmd, build_cmd: build_cmd, no_reboot:true) + buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug') } } stage('Clang Format') { From f6198353c10d4ae3e77becc41599ca790f13eb86 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Sun, 6 Feb 2022 17:05:10 -0600 Subject: [PATCH 07/13] add cppcheck as dev dependency --- Dockerfile | 11 +++++++++++ Jenkinsfile | 2 +- dev-requirements.txt | 3 +++ rbuild.ini | 8 ++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 dev-requirements.txt create mode 100644 rbuild.ini diff --git a/Dockerfile b/Dockerfile index 597040b7a0c..61aebd1cce5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -86,5 +86,16 @@ RUN pip install https://github.com/pfultz2/rclone/archive/master.tar.gz ARG PREFIX=/opt/rocm # Install dependencies RUN cget install pfultz2/rocm-recipes +# Install rbuild +RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz +# Setup ubsan environment to printstacktrace +ENV UBSAN_OPTIONS=print_stacktrace=1 + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ADD rbuild.ini /rbuild.ini +ADD dev-requirements.txt dev-requirements.txt +RUN rbuild prepare -s develop -d $PREFIX +RUN groupadd -f render # RUN cget install -f min-requirements.txt # RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt diff --git a/Jenkinsfile b/Jenkinsfile index 3068223e421..3e5f5503848 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,7 +19,7 @@ def cmake_build(Map conf=[:]){ def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") def build_envs = "CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 " + conf.get("build_env","") def prefixpath = conf.get("prefixpath","/opt/rocm") - def setup_args = " -DMIOPEN_GPU_SYNC=Off " + conf.get("setup_flags","") + def setup_args = conf.get("setup_flags","") if (prefixpath != "/usr/local"){ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 00000000000..5d123edb856 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,3 @@ +ROCmSoftwarePlatform/rocm-recipes +# 1.90+ +danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f \ No newline at end of file diff --git a/rbuild.ini b/rbuild.ini new file mode 100644 index 00000000000..2ab625c4114 --- /dev/null +++ b/rbuild.ini @@ -0,0 +1,8 @@ +[develop] +cxx = ${rocm_path}/bin/hipcc +cc = ${rocm_path}/llvm/bin/clang +ignore = pcre +deps = + -f dev-requirements.txt +define = + BUILD_DEV=On \ No newline at end of file From 659f9207c95b10d0bd827c090c1d1aba857aef8c Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Thu, 10 Feb 2022 11:01:29 -0600 Subject: [PATCH 08/13] update cmake file --- CMakeLists.txt | 51 ++++++++ cmake/TargetFlags.cmake | 50 ++++++++ composable_kernel/include/config.hpp | 172 +++++++++++++++++++++++++++ 3 files changed, 273 insertions(+) create mode 100644 cmake/TargetFlags.cmake create mode 100644 composable_kernel/include/config.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index cb0508fec5c..c58ae21caae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,25 @@ cmake_minimum_required(VERSION 3.5) + +# Check support for CUDA/HIP in Cmake project(composable_kernel) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") +enable_testing() + +find_package(ROCM REQUIRED PATHS /opt/rocm) + +include(ROCMInstallTargets) +include(ROCMPackageConfigHelpers) +include(ROCMSetupVersion) +include(ROCMInstallSymlinks) +include(ROCMCreatePackage) include(CheckCXXCompilerFlag) +rocm_setup_version(VERSION 1.0.0) +include(TargetFlags) +list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) + ## C++ enable_language(CXX) set(CMAKE_CXX_STANDARD 17) @@ -50,6 +65,12 @@ if(BUILD_DEV) endif() message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +rocm_create_package( + NAME MIOpen-${MIOPEN_BACKEND} + DESCRIPTION "High Performance Composable Kernels for AMD GPUs" + MAINTAINER "MIOpen Kernel Dev Team " + LDCONFIG +) ## tidy include(EnableCompilerWarnings) set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) @@ -60,6 +81,7 @@ elseif(MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU") set(MIOPEN_TIDY_ERRORS ALL) endif() + include(ClangTidy) enable_clang_tidy( CHECKS @@ -196,6 +218,35 @@ enable_cppcheck( CPPCHECK=1 __linux__=1 ) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) + +file(GLOB_RECURSE COMPOSABLE_KERNEL_HEADERS "composable_kernel/include/*/*.hpp") +file(GLOB_RECURSE DEVICE_OPS_HEADERS "device_operation/include/*/*.cpp") + +file(GLOB_RECURSE DEVICE_OPS_SOURCE "device_operation/*.cpp") +file(GLOB_RECURSE COMPOSABLE_KERNEL_WRAPPER_SOURCE "composable_kernel/src/kernel_wrapper/*.cpp") + +set(CK_HEADERS ${COMPOSABLE_KERNEL_HEADERS} ${DEVICE_OPS_HEADERS}) +set(CK_SOURCE ${DEVICE_OPS_SOURCE} ${COMPOSABLE_KERNEL_WRAPPER_SOURCE}) +add_library( composable_kernel + ${CK_SOURCE} + ) + +target_include_directories(composable_kernel PUBLIC + $ +) +target_include_directories(composable_kernel PUBLIC + $ +) +target_include_directories(composable_kernel PUBLIC + $ +) +target_include_directories(composable_kernel PUBLIC + $ +) +# clang_tidy_check(composable_kernel) add_subdirectory(host) add_subdirectory(example) diff --git a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake new file mode 100644 index 00000000000..4f83fb5d396 --- /dev/null +++ b/cmake/TargetFlags.cmake @@ -0,0 +1,50 @@ + +function(get_target_property2 VAR TARGET PROPERTY) + get_target_property(_pflags ${TARGET} ${PROPERTY}) + if(_pflags) + set(${VAR} ${_pflags} PARENT_SCOPE) + else() + set(${VAR} "" PARENT_SCOPE) + endif() +endfunction() + + +macro(append_flags FLAGS TARGET PROPERTY PREFIX) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + else() + string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") + endif() + endforeach() +endmacro() + +macro(append_link_flags FLAGS TARGET PROPERTY) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + elseif(FLAG MATCHES "^-.*") + string(APPEND ${FLAGS} " ${FLAG}") + elseif(EXISTS ${FLAG}) + string(APPEND ${FLAGS} " ${FLAG}") + else() + string(APPEND ${FLAGS} " -l${FLAG}") + endif() + endforeach() +endmacro() + +function(target_flags FLAGS TARGET) + set(_flags) + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") + append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") + append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") + # message("_flags: ${_flags}") + set(${FLAGS} ${_flags} PARENT_SCOPE) +endfunction() diff --git a/composable_kernel/include/config.hpp b/composable_kernel/include/config.hpp new file mode 100644 index 00000000000..f29ab546605 --- /dev/null +++ b/composable_kernel/include/config.hpp @@ -0,0 +1,172 @@ +#ifndef CK_CONFIG_AMD_HPP +#define CK_CONFIG_AMD_HPP + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" +#endif + +// "Constant" address space for kernel parameter +#define CONSTANT __attribute__((address_space(4))) + +// GPU target +// should enable one and only one GPU target +#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ + defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030)) +#error Need to define (only) one GPU target +#endif + +// launch bounds +#define CK_USE_LAUNCH_BOUNDS 1 + +#ifdef CK_USE_LAUNCH_BOUNDS +#define CK_MAX_THREAD_PER_BLOCK 256 +#define CK_MIN_BLOCK_PER_CU 2 +#endif + +// GPU-specific parameters +#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ + defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) +// buffer resourse +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 +// wave size +#define CK_GPU_WAVE_SIZE 64 +#elif defined(CK_AMD_GPU_GFX1030) +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 +#define CK_GPU_WAVE_SIZE 32 +#endif + +// FMA instruction +#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) +#define CK_USE_AMD_V_MAC_F32 +#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \ + defined(CK_AMD_GPU_GFX1030) +#define CK_USE_AMD_V_FMAC_F32 +#define CK_USE_AMD_V_DOT2_F32_F16 +#define CK_USE_AMD_V_DOT4_I32_I8 +#endif + +// multi index +#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 + +// AMD inline asm +#ifndef CK_USE_AMD_INLINE_ASM +#define CK_USE_AMD_INLINE_ASM 1 +#endif + +// AMD inner product (DLOP) +#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM +#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 +#endif + +// AMD buffer addressing +#ifndef CK_USE_AMD_BUFFER_ADDRESSING +#define CK_USE_AMD_BUFFER_ADDRESSING 1 +#endif + +// only gfx908 support native floating point atomic add +#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD +#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0 +#endif + +// AMD XDLOPS +#ifndef CK_USE_AMD_XDLOPS +#define CK_USE_AMD_XDLOPS 0 +#endif + +// block synchronization only s_wait lgkmcnt(0), not vmcnt(0) +#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM +#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 +#endif + +// experimental implementation for buffer load/store/atomic +#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 +#endif + +#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK +#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 +#endif + +#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK +#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1 +#endif + +// experimental implementation for in-regsiter sub-dword transpose +#ifndef CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE +#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1 +#endif + +// pass tensor descriptor by value or void* +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 +#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 + +// merge transformation use magic number division +#ifndef CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION +#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1 +#endif + +// use __builtin_memcpy instead of pointer cast to access a vector from pointer of scalar +#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS +#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0 +#endif + +// use __builtin_memcpy instead of union to do bit_cast +#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST +#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1 +#endif + +// hack: have underlying assumption that need to be satsified, otherwise it's a bug +// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be +// thread-invariant, otherwise it's a bug +// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" +#ifndef CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE +#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 +#endif + +// workaround for compiler crash when compiling recursive lambda +#ifndef CK_WORKAROUND_SWDEV_275126 +#define CK_WORKAROUND_SWDEV_275126 1 +#endif + +// workaround for compiler crash when using buffer load/store for i8 +#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE +#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1 +#endif + +// workaround for compiler gnerating inefficient ds_write instructions +#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE +#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1 +#endif + +// workaround for register spill due to compiler issue, when casting type between fp32 and fp16 +#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE +#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE 1 +#endif + +#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE +#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1 +#endif + +namespace ck { + +enum InMemoryDataOperationEnum_t +{ + Set, + AtomicAdd, + Add +}; + +enum ActivTypeEnum_t +{ + None, + LeakyRelu, + Sigmoid +}; + +// index type +using index_t = int32_t; + +} // namespace ck +#endif From 75f9b7d3c26c4ef69da9ba1d8e2187ba5abe15bc Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 11 Feb 2022 12:28:07 -0600 Subject: [PATCH 09/13] Add profiler build stage --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++------- Jenkinsfile | 22 ++++++++++++++++----- profiler/CMakeLists.txt | 1 + 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c58ae21caae..e73bb528131 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,12 +45,30 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}") message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}") message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") link_libraries(${OpenMP_gomp_LIBRARY}) link_libraries(${OpenMP_pthread_LIBRARY}) ## HIP find_package(HIP REQUIRED) +# Override HIP version in config.h, if necessary. +# The variables set by find_package() can't be overwritten, +# therefore let's use intermediate variables. +set(CK_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") +set(CK_hip_VERSION_MINOR "${hip_VERSION_MINOR}") +set(CK_hip_VERSION_PATCH "${hip_VERSION_PATCH}") +if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR ) + set(CK_hip_VERSION_MAJOR "${MIOPEN_OVERRIDE_HIP_VERSION_MAJOR}") + message(STATUS "CK_hip_VERSION_MAJOR overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_MAJOR}") +endif() +if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR ) + set(CK_hip_VERSION_MINOR "${MIOPEN_OVERRIDE_HIP_VERSION_MINOR}") + message(STATUS "CK_hip_VERSION_MINOR overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_MINOR}") +endif() +if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) + set(CK_hip_VERSION_PATCH "${MIOPEN_OVERRIDE_HIP_VERSION_PATCH}") + message(STATUS "CK_hip_VERSION_PATCH overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_PATCH}") +endif() message(STATUS "Build with HIP ${hip_VERSION}") ## half @@ -58,12 +76,6 @@ message(STATUS "Build with HIP ${hip_VERSION}") set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/half/include") message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}") -# CMAKE_CXX_FLAGS -SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") -if(BUILD_DEV) - string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything") -endif() -message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") rocm_create_package( NAME MIOpen-${MIOPEN_BACKEND} @@ -246,7 +258,25 @@ target_include_directories(composable_kernel PUBLIC target_include_directories(composable_kernel PUBLIC $ ) +# The following should eventually be removed +target_include_directories(composable_kernel PUBLIC + $ +) +target_include_directories(composable_kernel PUBLIC + $ +) +target_include_directories(composable_kernel PUBLIC + $ +) # clang_tidy_check(composable_kernel) +SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") +if(BUILD_DEV) + target_compile_options(composable_kernel PRIVATE -Werror) + target_compile_options(composable_kernel PRIVATE -Weverything) +endif() +message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + +configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/hip_version.hpp") add_subdirectory(host) add_subdirectory(example) diff --git a/Jenkinsfile b/Jenkinsfile index 3e5f5503848..e32064c9486 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -166,14 +166,26 @@ pipeline { stages{ stage("Static checks") { parallel{ - stage('Tidy') { - agent{ label rocmnode("nogpu") } + // enable after we move from hipcc to hip-clang + // stage('Tidy') { + // agent{ label rocmnode("nogpu") } + // environment{ + // // setup_cmd = "CXX='/opt/rocm/bin/hipcc' cmake -DBUILD_DEV=On .. " + // build_cmd = "make -j\$(nproc) -k analyze" + // } + // steps{ + // buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug') + // } + // } + stage('Build Profiler: gfx908') + { + agent { label rocmnode("gfx908")} environment{ - // setup_cmd = "CXX='/opt/rocm/bin/hipcc' cmake -DBUILD_DEV=On .. " - build_cmd = "make -j\$(nproc) -k analyze" + setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """ + build_cmd = "make -j\$(nproc) -k ckProfiler" } steps{ - buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug') + buildHipClangJobAndReboot(setup_args:setup_args, build_cmd:build_cmd, no_reboot:true, build_type: 'debug') } } stage('Clang Format') { diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index 6ef9cd60146..f362c62e28f 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -55,6 +55,7 @@ set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_I install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) # device_conv2d_fwd_bias_relu_add_instance +# Missing file "blockwise_tensor_slice_transfer.hpp" set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE ${PROJECT_SOURCE_DIR}/device_operation/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp; ) From 8148c4382a1487b85ce92073e0ecd534a31f00f0 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 11 Feb 2022 15:24:56 -0600 Subject: [PATCH 10/13] add hip_version config file for reduction operator --- composable_kernel/include/hip_version.hpp.in | 29 ++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 composable_kernel/include/hip_version.hpp.in diff --git a/composable_kernel/include/hip_version.hpp.in b/composable_kernel/include/hip_version.hpp.in new file mode 100644 index 00000000000..840ae924166 --- /dev/null +++ b/composable_kernel/include/hip_version.hpp.in @@ -0,0 +1,29 @@ +#pragma once + + +// "_PACKAGE_" to avoid name contentions: the macros like +// HIP_VERSION_MAJOR are defined in hip_version.h. +// clang-format off +#define HIP_PACKAGE_VERSION_MAJOR @CK_hip_VERSION_MAJOR@ +#define HIP_PACKAGE_VERSION_MINOR @CK_hip_VERSION_MINOR@ +#define HIP_PACKAGE_VERSION_PATCH @CK_hip_VERSION_PATCH@ +// clang-format on + +#ifndef HIP_PACKAGE_VERSION_MAJOR +#define HIP_PACKAGE_VERSION_MAJOR 0 +#endif +#ifndef HIP_PACKAGE_VERSION_MINOR +#define HIP_PACKAGE_VERSION_MINOR 0 +#endif +#ifndef HIP_PACKAGE_VERSION_PATCH +#define HIP_PACKAGE_VERSION_PATCH 0 +#endif +// 3 decimal digits for major and minor, 6 digits for patch number. +// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math. +#if HIP_PACKAGE_VERSION_MAJOR > 999 || HIP_PACKAGE_VERSION_MAJOR > 999 || \ + HIP_PACKAGE_VERSION_PATCH > 999999 +#error "Too big HIP version number(s)" +#endif +#define HIP_PACKAGE_VERSION_FLAT \ + ((HIP_PACKAGE_VERSION_MAJOR * 1000ULL + HIP_PACKAGE_VERSION_MINOR) * 1000000 + \ + HIP_PACKAGE_VERSION_PATCH) \ No newline at end of file From 9e3c055070ae535a87d50c08bc4e89feab07d97d Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 11 Feb 2022 15:52:49 -0600 Subject: [PATCH 11/13] correct jenkins var name --- Jenkinsfile | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e32064c9486..9863ed22eab 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,7 +19,7 @@ def cmake_build(Map conf=[:]){ def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") def build_envs = "CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 " + conf.get("build_env","") def prefixpath = conf.get("prefixpath","/opt/rocm") - def setup_args = conf.get("setup_flags","") + def setup_args = conf.get("setup_args","") if (prefixpath != "/usr/local"){ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " @@ -207,18 +207,19 @@ pipeline { } } } - stage("Packages") { - when { - expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } - } - parallel { - stage("Package /opt/rocm") { - agent{ label rocmnode("nogpu") } - steps{ - buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") - } - } - } - } + // enable after the cmake file supports packaging + // stage("Packages") { + // when { + // expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } + // } + // parallel { + // stage("Package /opt/rocm") { + // agent{ label rocmnode("nogpu") } + // steps{ + // buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") + // } + // } + // } + // } } } \ No newline at end of file From cb754aa92d60acf0ea436d923072b0de3891056d Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 11 Feb 2022 17:30:50 -0600 Subject: [PATCH 12/13] Build release instead of debug --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9863ed22eab..f7f029ce90f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -185,7 +185,7 @@ pipeline { build_cmd = "make -j\$(nproc) -k ckProfiler" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, build_cmd:build_cmd, no_reboot:true, build_type: 'debug') + buildHipClangJobAndReboot(setup_args:setup_args, build_cmd:build_cmd, no_reboot:true, build_type: 'Release') } } stage('Clang Format') { From 7369aae30d9dcc8598e1633cdb4a5919933820c3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 19 Feb 2022 03:42:03 +0000 Subject: [PATCH 13/13] clean up --- CMakeLists.txt | 46 +++-- composable_kernel/include/config.hpp | 2 +- composable_kernel/include/hip_version.hpp.in | 31 ++-- composable_kernel/include/utility/config.hpp | 172 ------------------- 4 files changed, 38 insertions(+), 213 deletions(-) delete mode 100644 composable_kernel/include/utility/config.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9867061eec2..021f5caf065 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,22 +54,22 @@ find_package(HIP REQUIRED) # Override HIP version in config.h, if necessary. # The variables set by find_package() can't be overwritten, # therefore let's use intermediate variables. -set(CK_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") -set(CK_hip_VERSION_MINOR "${hip_VERSION_MINOR}") -set(CK_hip_VERSION_PATCH "${hip_VERSION_PATCH}") +set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}") +set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}") +set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}") if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR ) - set(CK_hip_VERSION_MAJOR "${MIOPEN_OVERRIDE_HIP_VERSION_MAJOR}") - message(STATUS "CK_hip_VERSION_MAJOR overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_MAJOR}") + set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}") + message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}") endif() if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR ) - set(CK_hip_VERSION_MINOR "${MIOPEN_OVERRIDE_HIP_VERSION_MINOR}") - message(STATUS "CK_hip_VERSION_MINOR overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_MINOR}") + set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}") + message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}") endif() if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) - set(CK_hip_VERSION_PATCH "${MIOPEN_OVERRIDE_HIP_VERSION_PATCH}") - message(STATUS "CK_hip_VERSION_PATCH overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_PATCH}") + set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}") + message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") endif() -message(STATUS "Build with HIP ${hip_VERSION}") +message(STATUS "Build with HIP ${HIP_VERSION}") ## half #find_path(HALF_INCLUDE_DIR half.hpp) @@ -78,19 +78,18 @@ message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}") rocm_create_package( - NAME MIOpen-${MIOPEN_BACKEND} + NAME CK-${CK_BACKEND} DESCRIPTION "High Performance Composable Kernels for AMD GPUs" - MAINTAINER "MIOpen Kernel Dev Team " LDCONFIG ) ## tidy include(EnableCompilerWarnings) -set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) +set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+") - set(MIOPEN_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter) + set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter) # Enable tidy on hip -elseif(MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU") - set(MIOPEN_TIDY_ERRORS ALL) +elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU") + set(CK_TIDY_ERRORS ALL) endif() @@ -186,12 +185,12 @@ enable_clang_tidy( -altera-struct-pack-align -cppcoreguidelines-prefer-member-initializer - ${MIOPEN_TIDY_CHECKS} - ${MIOPEN_TIDY_ERRORS} + ${CK_TIDY_CHECKS} + ${CK_TIDY_ERRORS} HEADER_FILTER "\.hpp$" EXTRA_ARGS - -DMIOPEN_USE_CLANG_TIDY + -DCK_USE_CLANG_TIDY ) include(CppCheck) @@ -235,16 +234,15 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) file(GLOB_RECURSE COMPOSABLE_KERNEL_HEADERS "composable_kernel/include/*/*.hpp") -file(GLOB_RECURSE DEVICE_OPS_HEADERS "device_operation/include/*/*.cpp") +file(GLOB_RECURSE DEVICE_OPS_HEADERS "device_operation/include/*.hpp") file(GLOB_RECURSE DEVICE_OPS_SOURCE "device_operation/*.cpp") -file(GLOB_RECURSE COMPOSABLE_KERNEL_WRAPPER_SOURCE "composable_kernel/src/kernel_wrapper/*.cpp") set(CK_HEADERS ${COMPOSABLE_KERNEL_HEADERS} ${DEVICE_OPS_HEADERS}) -set(CK_SOURCE ${DEVICE_OPS_SOURCE} ${COMPOSABLE_KERNEL_WRAPPER_SOURCE}) -add_library( composable_kernel +set(CK_SOURCE ${DEVICE_OPS_SOURCE}) +add_library(composable_kernel ${CK_SOURCE} - ) +) target_include_directories(composable_kernel PUBLIC $ diff --git a/composable_kernel/include/config.hpp b/composable_kernel/include/config.hpp index f29ab546605..bb6ba58e6a1 100644 --- a/composable_kernel/include/config.hpp +++ b/composable_kernel/include/config.hpp @@ -1,7 +1,7 @@ #ifndef CK_CONFIG_AMD_HPP #define CK_CONFIG_AMD_HPP -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" #endif diff --git a/composable_kernel/include/hip_version.hpp.in b/composable_kernel/include/hip_version.hpp.in index 840ae924166..4290ef7e0dc 100644 --- a/composable_kernel/include/hip_version.hpp.in +++ b/composable_kernel/include/hip_version.hpp.in @@ -1,29 +1,28 @@ #pragma once - // "_PACKAGE_" to avoid name contentions: the macros like -// HIP_VERSION_MAJOR are defined in hip_version.h. +// HIP_VERSION_MAJOR are defined in HIP_VERSION.h. // clang-format off -#define HIP_PACKAGE_VERSION_MAJOR @CK_hip_VERSION_MAJOR@ -#define HIP_PACKAGE_VERSION_MINOR @CK_hip_VERSION_MINOR@ -#define HIP_PACKAGE_VERSION_PATCH @CK_hip_VERSION_PATCH@ +#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@ +#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@ +#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@ // clang-format on -#ifndef HIP_PACKAGE_VERSION_MAJOR -#define HIP_PACKAGE_VERSION_MAJOR 0 +#ifndef CK_HIP_PACKAGE_VERSION_MAJOR +#define CK_HIP_PACKAGE_VERSION_MAJOR 0 #endif -#ifndef HIP_PACKAGE_VERSION_MINOR -#define HIP_PACKAGE_VERSION_MINOR 0 +#ifndef CK_HIP_PACKAGE_VERSION_MINOR +#define CK_HIP_PACKAGE_VERSION_MINOR 0 #endif -#ifndef HIP_PACKAGE_VERSION_PATCH -#define HIP_PACKAGE_VERSION_PATCH 0 +#ifndef CK_HIP_PACKAGE_VERSION_PATCH +#define CK_HIP_PACKAGE_VERSION_PATCH 0 #endif // 3 decimal digits for major and minor, 6 digits for patch number. // Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math. -#if HIP_PACKAGE_VERSION_MAJOR > 999 || HIP_PACKAGE_VERSION_MAJOR > 999 || \ - HIP_PACKAGE_VERSION_PATCH > 999999 +#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \ + CK_HIP_PACKAGE_VERSION_PATCH > 999999 #error "Too big HIP version number(s)" #endif -#define HIP_PACKAGE_VERSION_FLAT \ - ((HIP_PACKAGE_VERSION_MAJOR * 1000ULL + HIP_PACKAGE_VERSION_MINOR) * 1000000 + \ - HIP_PACKAGE_VERSION_PATCH) \ No newline at end of file +#define CK_HIP_PACKAGE_VERSION_FLAT \ + ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \ + CK_HIP_PACKAGE_VERSION_PATCH) diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp deleted file mode 100644 index f29ab546605..00000000000 --- a/composable_kernel/include/utility/config.hpp +++ /dev/null @@ -1,172 +0,0 @@ -#ifndef CK_CONFIG_AMD_HPP -#define CK_CONFIG_AMD_HPP - -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include "hip/hip_runtime.h" -#include "hip/hip_fp16.h" -#endif - -// "Constant" address space for kernel parameter -#define CONSTANT __attribute__((address_space(4))) - -// GPU target -// should enable one and only one GPU target -#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ - defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030)) -#error Need to define (only) one GPU target -#endif - -// launch bounds -#define CK_USE_LAUNCH_BOUNDS 1 - -#ifdef CK_USE_LAUNCH_BOUNDS -#define CK_MAX_THREAD_PER_BLOCK 256 -#define CK_MIN_BLOCK_PER_CU 2 -#endif - -// GPU-specific parameters -#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ - defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) -// buffer resourse -#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 -// wave size -#define CK_GPU_WAVE_SIZE 64 -#elif defined(CK_AMD_GPU_GFX1030) -#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 -#define CK_GPU_WAVE_SIZE 32 -#endif - -// FMA instruction -#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) -#define CK_USE_AMD_V_MAC_F32 -#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \ - defined(CK_AMD_GPU_GFX1030) -#define CK_USE_AMD_V_FMAC_F32 -#define CK_USE_AMD_V_DOT2_F32_F16 -#define CK_USE_AMD_V_DOT4_I32_I8 -#endif - -// multi index -#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 - -// AMD inline asm -#ifndef CK_USE_AMD_INLINE_ASM -#define CK_USE_AMD_INLINE_ASM 1 -#endif - -// AMD inner product (DLOP) -#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM -#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 -#endif - -// AMD buffer addressing -#ifndef CK_USE_AMD_BUFFER_ADDRESSING -#define CK_USE_AMD_BUFFER_ADDRESSING 1 -#endif - -// only gfx908 support native floating point atomic add -#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD -#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0 -#endif - -// AMD XDLOPS -#ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 0 -#endif - -// block synchronization only s_wait lgkmcnt(0), not vmcnt(0) -#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM -#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 -#endif - -// experimental implementation for buffer load/store/atomic -#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 -#endif - -#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1 -#endif - -// experimental implementation for in-regsiter sub-dword transpose -#ifndef CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE -#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1 -#endif - -// pass tensor descriptor by value or void* -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 -#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 - -// merge transformation use magic number division -#ifndef CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION -#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1 -#endif - -// use __builtin_memcpy instead of pointer cast to access a vector from pointer of scalar -#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS -#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0 -#endif - -// use __builtin_memcpy instead of union to do bit_cast -#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST -#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1 -#endif - -// hack: have underlying assumption that need to be satsified, otherwise it's a bug -// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be -// thread-invariant, otherwise it's a bug -// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" -#ifndef CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE -#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 -#endif - -// workaround for compiler crash when compiling recursive lambda -#ifndef CK_WORKAROUND_SWDEV_275126 -#define CK_WORKAROUND_SWDEV_275126 1 -#endif - -// workaround for compiler crash when using buffer load/store for i8 -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1 -#endif - -// workaround for compiler gnerating inefficient ds_write instructions -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1 -#endif - -// workaround for register spill due to compiler issue, when casting type between fp32 and fp16 -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE 1 -#endif - -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1 -#endif - -namespace ck { - -enum InMemoryDataOperationEnum_t -{ - Set, - AtomicAdd, - Add -}; - -enum ActivTypeEnum_t -{ - None, - LeakyRelu, - Sigmoid -}; - -// index type -using index_t = int32_t; - -} // namespace ck -#endif