diff --git a/.github/workflows/README_BUILD_PACKAGES.md b/.github/workflows/README_BUILD_PACKAGES.md new file mode 100644 index 00000000..2490f1b7 --- /dev/null +++ b/.github/workflows/README_BUILD_PACKAGES.md @@ -0,0 +1,226 @@ +# Building Relocatable TransferBench Packages with GitHub Actions + +This document describes the GitHub Actions workflow for building relocatable +TransferBench packages using the ROCm SDK from +[TheRock](https://github.com/ROCm/TheRock). + +The workflow (`.github/workflows/build-relocatable-packages.yml`) and the +`build_packages_local.sh` script at the repo root produce: + +- **DEB** packages for Ubuntu/Debian +- **RPM** packages for AlmaLinux/Rocky/RHEL (built in `manylinux_2_28`) +- **TGZ** archives for any Linux distribution + +All packages install to `/opt/rocm/extras-<rocm-major>` and use relocatable +`$ORIGIN`-relative `RPATH` so the install tree itself can be moved without +hard-coded library paths. These artifacts are **not** fully self-contained: +target systems must still provide the required ROCm/HSA runtime libraries +(declared as package dependencies: `hsa-rocr` and `numactl`). + +This workflow is modeled on the +[ROCmValidationSuite packaging workflow](https://github.com/ROCm/ROCmValidationSuite/blob/master/.github/workflows/README_BUILD_PACKAGES.md). + +## Workflow Triggers + +| Trigger | Behavior | +|---------|----------| +| Push to `develop`, `mainline`, `release/**`, `candidate` | Build + upload to S3 (if configured) + regenerate apt/yum repo metadata | +| Pull request to `develop`, `mainline` | Build + upload to ref-specific S3 path (no repo metadata) | +| Schedule (daily 13:00 UTC) | Same as push, with auto-fetched latest ROCm | +| `workflow_dispatch` | Manual trigger with `rocm_version` and `gpu_family` inputs | + +### Manual trigger inputs + +- **`rocm_version`** (e.g. `7.11.0a20260121`). Empty = auto-fetch latest from TheRock. 
+- **`gpu_family`** — one of: + - `gfx94X-dcgpu` (MI300A/MI300X) — **default** + - `gfx950-dcgpu` (MI350X/MI355X) + - `gfx110X-all` (RX 7900 XTX, 7800 XT, 7700S, Radeon 780M) + - `gfx120X-all` (RX 9060/XT, 9070/XT) + - `gfx1151` (Strix Halo iGPU) + +## Build features enabled in CI + +The workflow always builds with: + +- `ENABLE_NIC_EXEC=OFF` — RDMA NIC executor disabled (would require libibverbs.so.1 at runtime; not bundled by TheRock SDK) +- `ENABLE_MPI_COMM=OFF` — MPI multi-node communicator disabled (would require OpenMPI at runtime; not bundled by TheRock SDK). Packages are built to run out of the box with only `numactl`/`libnuma1` from the OS. +- `DISABLE_DMABUF=OFF` — keeps DMA-BUF support for GPU Direct RDMA enabled +- `BUILD_RELOCATABLE_PACKAGE=ON` — RVS-style install prefix + package naming +- `GPU_TARGETS` — full data-center + consumer set (gfx906, 908, 90a, 942, 950, 1030, 1100/01/02, 1150/51, 1200/01) + +## Local builds + +The same script the workflow uses also works locally: + +```bash +# Auto-fetch latest ROCm +sudo ./build_packages_local.sh + +# Pin a specific version (use sudo -E to preserve env) +sudo -E ROCM_VERSION=7.11.0a20260121 GPU_FAMILY=gfx94X-dcgpu ./build_packages_local.sh + +# Debug build +sudo -E BUILD_TYPE=Debug ./build_packages_local.sh +``` + +`sudo` is required because the script installs system packages +(`build-essential`, `cmake`, `libnuma-dev`, `dpkg-dev`, `rpm`, etc.). 
+ +After the script completes, packages live under `build/`: + +``` +build/amdrocm7-transferbench_1.66.02-<release>_amd64.deb +build/amdrocm7-transferbench-1.66.02-<release>.x86_64.rpm +build/amdrocm7-transferbench-1.66.02-Linux.tar.gz +``` + +## Installing built packages + +### Ubuntu / Debian + +```bash +sudo dpkg -i build/amdrocm7-transferbench_*.deb +/opt/rocm/extras-7/bin/TransferBench +``` + +### Rocky / RHEL / AlmaLinux + +```bash +sudo rpm -i --replacefiles --nodeps build/amdrocm7-transferbench-*.rpm +/opt/rocm/extras-7/bin/TransferBench +``` + +### Any Linux (TGZ — relocatable install tree, requires ROCm runtime on target) + +```bash +sudo mkdir -p /opt/rocm/extras-7 +sudo tar -xzf build/amdrocm7-transferbench-*.tar.gz -C /opt/rocm/extras-7 --strip-components=1 +export PATH=/opt/rocm/extras-7/bin:$PATH +TransferBench +``` + +## S3 upload (OIDC) + +S3 upload runs only when: +- The repository is `ROCm/TransferBench`, **and** +- The `AWS_S3_BUCKET` repository variable is set. + +Upload uses **AWS OIDC** — no long-term keys are stored in the repo. + +### S3 path layout + +| Trigger | Path | +|---------|------| +| `release/*` push or dispatch | `release/transferbench/{deb,rpm,tar}/` | +| Schedule, push to `develop`/`mainline`, dispatch on non-release | `nightly/transferbench/{deb,rpm,tar}/` | +| Pull request (same repo) | `transferbench/<branch>/<run_number>/{ubuntu-22.04,manylinux_2_28}/` | + +### Required repository setup + +In **Settings → Secrets and variables → Actions**: + +**Secrets tab:** +- `AWS_ROLE_ARN` — IAM role ARN with OIDC trust for this repo (e.g. `arn:aws:iam::123456789012:role/rocm-transferbench-s3-upload`) + +**Variables tab:** +- `AWS_S3_BUCKET` — bucket name (e.g. 
`rocm-transferbench-packages`) +- `RUNNER_LABEL` (optional) — override Ubuntu runner label (default `ubuntu-22.04`) +- `RUNNER_LABEL_CONTAINER` (optional) — override container-job runner label (default `ubuntu-latest`) +- `RUNNER_LABEL_UTILITY` (optional) — override summary-job runner label (default `ubuntu-latest`) + +### IAM role trust policy + +The role in `AWS_ROLE_ARN` must trust GitHub's OIDC provider: + +```json +{ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Principal": { + "Federated": "arn:aws:iam::<account-id>:oidc-provider/token.actions.githubusercontent.com" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { "token.actions.githubusercontent.com:aud": "sts.amazonaws.com" }, + "StringLike": { "token.actions.githubusercontent.com:sub": "repo:ROCm/TransferBench:*" } + } + }] +} +``` + +Permissions needed: `s3:PutObject`, `s3:GetObject`, `s3:ListBucket`, `s3:DeleteObject` on the bucket. + +## Using the S3 paths as apt / yum repos + +Push and scheduled builds also publish APT / YUM metadata so the S3 paths +work directly as native package repositories. + +### apt (Ubuntu / Debian) + +```bash +echo "deb [trusted=yes] https://<bucket>.s3.amazonaws.com/nightly/transferbench/deb/ ./" \ + | sudo tee /etc/apt/sources.list.d/transferbench-nightly.list +sudo apt update +sudo apt install amdrocm7-transferbench +``` + +### yum / dnf (Rocky / RHEL / AlmaLinux) + +```bash +sudo tee /etc/yum.repos.d/transferbench-nightly.repo <<'EOF' +[transferbench-nightly] +name=TransferBench Nightly +baseurl=https://<bucket>.s3.amazonaws.com/nightly/transferbench/rpm/ +enabled=1 +gpgcheck=0 +EOF +sudo dnf install amdrocm7-transferbench +``` + +> **Note:** `[trusted=yes]` / `gpgcheck=0` skip GPG verification. For +> production deployments, sign packages and metadata with a GPG key. 
+ +## Verifying RPATH + +```bash +readelf -d /opt/rocm/extras-7/bin/TransferBench | grep -E 'RPATH|RUNPATH' +# Should contain $ORIGIN, $ORIGIN/../lib, /opt/rocm/extras-7/lib +``` + +## Troubleshooting + +### S3 step fails with "Credentials could not be loaded" + +- PR from a fork: OIDC is unavailable; the upload step is skipped. +- Same-repo: confirm `AWS_ROLE_ARN` secret is set and the role's trust + policy allows `repo:ROCm/TransferBench:*`. + +### Build fails: missing `infiniband/verbs.h` / `mpi.h` + +The packaged builds disable both `ENABLE_NIC_EXEC` and `ENABLE_MPI_COMM`, so these +headers are not required. If you've manually re-enabled either flag for a local +build, install the dev packages yourself: + +```bash +# Ubuntu — for ENABLE_NIC_EXEC=ON +sudo apt install -y libibverbs-dev rdma-core +# Ubuntu — for ENABLE_MPI_COMM=ON +sudo apt install -y libopenmpi-dev openmpi-bin +# Rocky/RHEL +sudo dnf install -y rdma-core-devel openmpi-devel +``` + +### TheRock tarball download 404s + +Check available builds at +<https://therock-nightly-tarball.s3.amazonaws.com/index.html>. Set +`ROCM_VERSION` explicitly to a known-good version. 
+
+## References
+
+- [TheRock Releases](https://github.com/ROCm/TheRock/blob/main/RELEASES.md)
+- [TheRock nightly tarballs](https://therock-nightly-tarball.s3.amazonaws.com/index.html)
+- [ROCmValidationSuite packaging workflow](https://github.com/ROCm/ROCmValidationSuite/blob/master/.github/workflows/README_BUILD_PACKAGES.md) — reference implementation
+- [TransferBench README](../../README.md)
diff --git a/.github/workflows/build-relocatable-packages.yml b/.github/workflows/build-relocatable-packages.yml
new file mode 100644
index 00000000..2f9d6a4b
--- /dev/null
+++ b/.github/workflows/build-relocatable-packages.yml
@@ -0,0 +1,302 @@
+name: Build Relocatable Packages
+
+on:
+  push:
+    branches: [develop, mainline, 'release/**', candidate]
+  pull_request:
+    branches: [develop, mainline]
+  schedule:
+    # Daily at 13:00 UTC (5:00 AM PST)
+    - cron: '0 13 * * *'
+  workflow_dispatch:
+    inputs:
+      rocm_version:
+        description: 'ROCm version (empty = auto-fetch latest from TheRock)'
+        required: false
+        default: ''
+      gpu_family:
+        description: 'GPU family target'
+        required: false
+        default: 'gfx94X-dcgpu'
+        type: choice
+        options:
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+          - gfx110X-all
+          - gfx120X-all
+          - gfx1151
+
+permissions:
+  contents: read
+  id-token: write  # Required for OIDC S3 upload
+
+env:
+  ROCM_VERSION: ${{ github.event.inputs.rocm_version || '' }}
+  GPU_FAMILY: ${{ github.event.inputs.gpu_family || 'gfx94X-dcgpu' }}
+  BUILD_TYPE: Release
+  # 'true' only in the upstream repo with a bucket configured AND when the run
+  # is not a pull request from a fork: fork PRs get neither secrets nor an OIDC
+  # token, so the AWS steps must be skipped there rather than fail the job.
+  S3_UPLOAD_ENABLED: ${{ github.repository == 'ROCm/TransferBench' && vars.AWS_S3_BUCKET != '' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }}
+
+jobs:
+  # ============================================================
+  # Ubuntu 22.04 — DEB + TGZ
+  # ============================================================
+  build-ubuntu:
+    name: Build (Ubuntu 22.04)
+    runs-on: ${{ vars.RUNNER_LABEL || 'ubuntu-22.04' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0  # for branch.commit version tags
+
+      - name: Set environment
+        run: |
+          echo "ROCM_VERSION=${ROCM_VERSION}" >> "$GITHUB_ENV"
+          echo "GPU_FAMILY=${GPU_FAMILY}" >> "$GITHUB_ENV"
+          echo "BUILD_TYPE=${BUILD_TYPE}" >> "$GITHUB_ENV"
+
+      - name: Build packages
+        run: |
+          chmod +x ./build_packages_local.sh
+          sudo -E ROCM_VERSION="${ROCM_VERSION}" \
+                  GPU_FAMILY="${GPU_FAMILY}" \
+                  BUILD_TYPE="${BUILD_TYPE}" \
+                  GITHUB_RUN_NUMBER="${GITHUB_RUN_NUMBER}" \
+                  GITHUB_REF_NAME="${GITHUB_REF_NAME}" \
+                  ./build_packages_local.sh
+
+      - name: Verify DEB package
+        run: |
+          shopt -s nullglob
+          for deb in build/amdrocm*-transferbench*.deb; do
+            echo "==> ${deb}"
+            dpkg-deb -I "${deb}"
+            dpkg-deb -c "${deb}" | head -50
+          done
+
+      - name: Upload artifacts (always, for inspection)
+        uses: actions/upload-artifact@v4
+        with:
+          name: ubuntu-22.04-packages
+          path: |
+            build/amdrocm*-transferbench*.deb
+            build/amdrocm*-transferbench*.tar.gz
+          if-no-files-found: error
+
+      - name: Configure AWS credentials (OIDC)
+        if: env.S3_UPLOAD_ENABLED == 'true'
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+          aws-region: us-east-1
+
+      - name: Upload to S3
+        if: env.S3_UPLOAD_ENABLED == 'true'
+        env:
+          AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }}
+        run: |
+          set -euo pipefail
+          if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+            S3_PATH="s3://${AWS_S3_BUCKET}/transferbench/${GITHUB_HEAD_REF//\//_}/${GITHUB_RUN_NUMBER}/ubuntu-22.04"
+            METADATA="skip"
+          elif [[ "${GITHUB_REF_NAME}" == release/* ]]; then
+            S3_PATH="s3://${AWS_S3_BUCKET}/release/transferbench/deb"
+            METADATA="generate"
+          else
+            S3_PATH="s3://${AWS_S3_BUCKET}/nightly/transferbench/deb"
+            METADATA="generate"
+          fi
+          echo "S3_DEB_PATH=${S3_PATH}" >> "$GITHUB_ENV"
+          echo "DEB_METADATA=${METADATA}" >> "$GITHUB_ENV"
+          aws s3 cp build/ "${S3_PATH}/" --recursive --exclude "*" \
+            --include "amdrocm*-transferbench*.deb" \
+            --include "amdrocm*-transferbench*.tar.gz"
+          echo "Uploaded to ${S3_PATH}"
+
+      - name: Generate apt repo metadata
+        if: github.repository == 'ROCm/TransferBench' && env.DEB_METADATA == 'generate'
+        env:
+          AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }}
+        run: |
+          set -euo pipefail
+          WORK="$(mktemp -d)"
+          aws s3 sync "${S3_DEB_PATH}/" "${WORK}/" --exclude "*" --include "*.deb"
+          pushd "${WORK}" >/dev/null
+          dpkg-scanpackages -m . /dev/null > Packages
+          gzip -kf Packages
+          # Release should carry the checksums of Packages/Packages.gz so apt can
+          # match the index files against it; apt-ftparchive (apt-utils, installed
+          # by the build script) generates those sections. Write it outside the
+          # scanned directory first so Release does not end up listing itself.
+          apt-ftparchive \
+            -o APT::FTPArchive::Release::Origin="ROCm-TransferBench" \
+            -o APT::FTPArchive::Release::Label="ROCm TransferBench Packages" \
+            -o APT::FTPArchive::Release::Suite="stable" \
+            -o APT::FTPArchive::Release::Codename="stable" \
+            -o APT::FTPArchive::Release::Architectures="amd64" \
+            -o APT::FTPArchive::Release::Components="main" \
+            -o APT::FTPArchive::Release::Description="TransferBench DEB packages built from TheRock SDK" \
+            release . > "${WORK}.Release"
+          mv "${WORK}.Release" Release
+          aws s3 cp Packages    "${S3_DEB_PATH}/Packages"
+          aws s3 cp Packages.gz "${S3_DEB_PATH}/Packages.gz"
+          aws s3 cp Release     "${S3_DEB_PATH}/Release"
+          popd >/dev/null
+
+  # ============================================================
+  # manylinux_2_28 (AlmaLinux 8) — RPM + TGZ
+  # ============================================================
+  build-manylinux:
+    name: Build (manylinux_2_28)
+    runs-on: ${{ vars.RUNNER_LABEL_CONTAINER || 'ubuntu-latest' }}
+    container:
+      image: quay.io/pypa/manylinux_2_28_x86_64
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+
+      - name: Set environment
+        run: |
+          echo "ROCM_VERSION=${ROCM_VERSION}" >> "$GITHUB_ENV"
+          echo "GPU_FAMILY=${GPU_FAMILY}" >> "$GITHUB_ENV"
+          echo "BUILD_TYPE=${BUILD_TYPE}" >> "$GITHUB_ENV"
+
+      - name: Build packages
+        run: |
+          chmod +x ./build_packages_local.sh
+          # No sudo: container runs as root
+          ROCM_VERSION="${ROCM_VERSION}" \
+          GPU_FAMILY="${GPU_FAMILY}" \
+          BUILD_TYPE="${BUILD_TYPE}" \
+          GITHUB_RUN_NUMBER="${GITHUB_RUN_NUMBER}" \
+          GITHUB_REF_NAME="${GITHUB_REF_NAME}" \
+          ./build_packages_local.sh
+
+      - name: Verify RPM package
+        run: |
+          shopt -s nullglob
+          for rpm in build/amdrocm*-transferbench*.rpm; do
+            echo "==> ${rpm}"
+            rpm -qip "${rpm}"
+            rpm -qlp "${rpm}" | head -50
+          done
+
+      - name: Upload artifacts (always, for inspection)
+        uses: actions/upload-artifact@v4
+        with:
+          name: manylinux_2_28-packages
+          path: |
+            build/amdrocm*-transferbench*.rpm
+            build/amdrocm*-transferbench*.tar.gz
+          if-no-files-found: error
+
+      - name: Install AWS CLI
+        if: env.S3_UPLOAD_ENABLED == 'true'
+        run: |
+          curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscli.zip
+          (cd /tmp && unzip -q awscli.zip && ./aws/install)
+
+      - name: Configure AWS credentials (OIDC)
+        if: env.S3_UPLOAD_ENABLED == 'true'
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+          aws-region: us-east-1
+
+      - name: Upload to S3
+        if: env.S3_UPLOAD_ENABLED == 'true'
+        env:
+          AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }}
+        run: |
+          set -euo pipefail
+          if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+            S3_RPM="s3://${AWS_S3_BUCKET}/transferbench/${GITHUB_HEAD_REF//\//_}/${GITHUB_RUN_NUMBER}/manylinux_2_28"
+            S3_TAR="${S3_RPM}"
+            METADATA="skip"
+          elif [[ "${GITHUB_REF_NAME}" == release/* ]]; then
+            S3_RPM="s3://${AWS_S3_BUCKET}/release/transferbench/rpm"
+            S3_TAR="s3://${AWS_S3_BUCKET}/release/transferbench/tar"
+            METADATA="generate"
+          else
+            S3_RPM="s3://${AWS_S3_BUCKET}/nightly/transferbench/rpm"
+            S3_TAR="s3://${AWS_S3_BUCKET}/nightly/transferbench/tar"
+            METADATA="generate"
+          fi
+          echo "S3_RPM_PATH=${S3_RPM}" >> "$GITHUB_ENV"
+          echo "S3_TAR_PATH=${S3_TAR}" >> "$GITHUB_ENV"
+          echo "RPM_METADATA=${METADATA}" >> "$GITHUB_ENV"
+          aws s3 cp build/ "${S3_RPM}/" --recursive --exclude "*" --include "amdrocm*-transferbench*.rpm"
+          aws s3 cp build/ "${S3_TAR}/" --recursive --exclude "*" --include "amdrocm*-transferbench*.tar.gz"
+          echo "Uploaded RPM to ${S3_RPM}, TGZ to ${S3_TAR}"
+
+      - name: Generate yum repo metadata
+        if: github.repository == 'ROCm/TransferBench' && env.RPM_METADATA == 'generate'
+        env:
+          AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }}
+        run: |
+          set -euo pipefail
+          dnf install -y createrepo_c || yum install -y createrepo_c
+          WORK="$(mktemp -d)"
+          aws s3 sync "${S3_RPM_PATH}/" "${WORK}/" --exclude "*" --include "*.rpm"
+          createrepo_c "${WORK}"
+          aws s3 sync "${WORK}/repodata/" "${S3_RPM_PATH}/repodata/" --delete
+
+  # ============================================================
+  # Build report — collects S3 paths for browsing
+  # ============================================================
+  release-summary:
+    name: Build Report
+    needs: [build-ubuntu, build-manylinux]
+    if: always()
+    runs-on: ${{ vars.RUNNER_LABEL_UTILITY || 'ubuntu-latest' }}
+    steps:
+      - name: Generate report
+        env:
+          AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }}
+        run: |
+          set -euo pipefail
+          mkdir -p report
+          {
+            echo "# TransferBench Build Report"
+            echo ""
+            echo "- Event: \`${GITHUB_EVENT_NAME}\`"
+            echo "- Ref: \`${GITHUB_REF_NAME}\`"
+            echo "- Run number: \`${GITHUB_RUN_NUMBER}\`"
+            echo "- ROCm: \`${ROCM_VERSION:-auto}\`"
+            echo "- GPU family: \`${GPU_FAMILY}\`"
+            echo "- Ubuntu job: \`${{ needs.build-ubuntu.result }}\`"
+            echo "- manylinux: \`${{ needs.build-manylinux.result }}\`"
+            echo ""
+            if [[ -n "${AWS_S3_BUCKET:-}" ]]; then
+              echo "## S3 Upload Locations"
+              if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+                BASE="transferbench/${GITHUB_HEAD_REF//\//_}/${GITHUB_RUN_NUMBER}"
+                echo "- [DEB (Ubuntu)](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=${BASE}/ubuntu-22.04/)"
+                echo "- [RPM/TGZ (manylinux)](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=${BASE}/manylinux_2_28/)"
+              elif [[ "${GITHUB_REF_NAME}" == release/* ]]; then
+                echo "- [DEB](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=release/transferbench/deb/)"
+                echo "- [RPM](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=release/transferbench/rpm/)"
+                echo "- [TGZ](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=release/transferbench/tar/)"
+              else
+                echo "- 
[DEB](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=nightly/transferbench/deb/)"
+                echo "- [RPM](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=nightly/transferbench/rpm/)"
+                echo "- [TGZ](https://s3.console.aws.amazon.com/s3/buckets/${AWS_S3_BUCKET}?prefix=nightly/transferbench/tar/)"
+              fi
+            else
+              echo "_S3 upload not configured (\`AWS_S3_BUCKET\` variable not set)._"
+            fi
+          } > report/build-report.md
+          cat report/build-report.md >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload report
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-report
+          path: report/build-report.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2da6baa9..2b6591d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine
 option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
 option(ENABLE_MPI_COMM "Enable MPI Communicator support" OFF)
 option(DISABLE_DMABUF "Disable DMA-BUF support for GPU Direct RDMA" ON)
+option(BUILD_RELOCATABLE_PACKAGE "Build with RVS-style relocatable RPATH and amdrocm-transferbench package naming" OFF)
 
 # Default GPU architectures to build
 #==================================================================================================
@@ -263,20 +264,79 @@ target_link_libraries(TransferBench PRIVATE ${HSA_LIBRARY})
 # Required on AlmaLinux 8 / manylinux_2_28; harmless no-op stub on newer toolchains.
 target_link_libraries(TransferBench PRIVATE stdc++fs)
 
-rocm_install(TARGETS TransferBench COMPONENT devel)
-rocm_setup_version(VERSION ${VERSION_STRING})
+if(BUILD_RELOCATABLE_PACKAGE)
+  # RVS-style relocatable packaging: bypass rocm_install/rocm_create_package and
+  # drive CPack directly so CMAKE_INSTALL_PREFIX / CPACK_PACKAGING_INSTALL_PREFIX
+  # set by the caller (build_packages_local.sh) are honored.
+  if(NOT DEFINED ROCM_MAJOR_VERSION)
+    set(ROCM_MAJOR_VERSION "7")
+  endif()
 
-# Package specific CPACK vars
-rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
+  install(TARGETS TransferBench RUNTIME DESTINATION bin COMPONENT devel)
+
+  set(CPACK_PACKAGE_NAME "amdrocm${ROCM_MAJOR_VERSION}-transferbench")
+  set(CPACK_PACKAGE_VERSION "${VERSION_STRING}")
+  set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
+  set(CPACK_PACKAGE_CONTACT "RCCL Team ")
+  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "TransferBench: benchmark simultaneous transfers between CPU/GPU/NIC")
+  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
+
+  # DEB
+  set(CPACK_DEBIAN_PACKAGE_NAME "${CPACK_PACKAGE_NAME}")
+  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
+  set(CPACK_DEBIAN_PACKAGE_DEPENDS "numactl, libnuma1, hsa-rocr")
+  set(CPACK_DEBIAN_PACKAGE_MAINTAINER "${CPACK_PACKAGE_CONTACT}")
+  if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
+    set(CPACK_DEBIAN_PACKAGE_RELEASE "$ENV{CPACK_DEBIAN_PACKAGE_RELEASE}")
+  endif()
+  # Use the Debian naming convention (<name>_<version>-<release>_<arch>.deb)
+  # instead of CPack's generic <name>-<version>-<system>.deb, so the release
+  # tag set above actually appears in the file name.
+  set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
 
-set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
-set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+  # RPM
+  set(CPACK_RPM_PACKAGE_NAME "${CPACK_PACKAGE_NAME}")
+  set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+  set(CPACK_RPM_PACKAGE_REQUIRES "numactl, hsa-rocr")
+  set(CPACK_RPM_PACKAGE_VENDOR "${CPACK_PACKAGE_VENDOR}")
+  if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
+    set(CPACK_RPM_PACKAGE_RELEASE "$ENV{CPACK_RPM_PACKAGE_RELEASE}")
+  endif()
+  # Likewise emit <name>-<version>-<release>.<arch>.rpm.
+  set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+  # Use the actual install prefix (caller-controlled in relocatable mode)
+  # rather than hard-coded /opt/... paths.
+  if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)
+    set(_rpm_exclude_prefix "${CPACK_PACKAGING_INSTALL_PREFIX}")
+  else()
+    set(_rpm_exclude_prefix "${CMAKE_INSTALL_PREFIX}")
+  endif()
+  set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
+    "/opt" "/opt/rocm"
+    "${_rpm_exclude_prefix}"
+    "${_rpm_exclude_prefix}/bin")
 
-set(PACKAGE_NAME TB)
-set(LIBRARY_NAME TransferBench)
+  # TGZ
+  set(CPACK_ARCHIVE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-Linux")
 
-rocm_create_package(
-  NAME ${LIBRARY_NAME}
-  DESCRIPTION "TransferBench package"
-  MAINTAINER "RCCL Team "
-)
+  set(CPACK_GENERATOR "DEB;RPM;TGZ")
+  include(CPack)
+else()
+  rocm_install(TARGETS TransferBench COMPONENT devel)
+  rocm_setup_version(VERSION ${VERSION_STRING})
+
+  # Package specific CPACK vars
+  rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
+
+  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
+  set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+
+  set(PACKAGE_NAME TB)
+  set(LIBRARY_NAME TransferBench)
+
+  rocm_create_package(
+    NAME ${LIBRARY_NAME}
+    DESCRIPTION "TransferBench package"
+    MAINTAINER "RCCL Team "
+  )
+endif()
diff --git a/build_packages_local.sh b/build_packages_local.sh
new file mode 100755
index 00000000..d10ec86e
--- /dev/null
+++ b/build_packages_local.sh
@@ -0,0 +1,290 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# build_packages_local.sh — single source of truth for building relocatable
+# TransferBench packages (DEB / RPM / TGZ) against TheRock ROCm SDK.
+# Used by both local developers and the GitHub Actions workflow.
+#
+# Usage:
+#   sudo ./build_packages_local.sh
+#   sudo -E ROCM_VERSION=7.11.0a20260121 GPU_FAMILY=gfx94X-dcgpu ./build_packages_local.sh
+#
+# Optional environment:
+#   ROCM_VERSION       TheRock SDK version; empty = newest published tarball.
+#   GPU_FAMILY         TheRock tarball family to download (default gfx94X-dcgpu).
+#   GPU_TARGETS        Semicolon-separated gfx targets compiled into the binary.
+#   BUILD_TYPE         CMAKE_BUILD_TYPE (default Release).
+#   GITHUB_REF_NAME    Branch name used for the package release tag.
+#   GITHUB_RUN_NUMBER  Release tag on release/* branches (default 1).
+#
+# Requires root (installs system packages).
+
+set -euo pipefail
+
+# -------- pretty output --------
+RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
+# %b expands the colour escapes; the message itself is printed verbatim.
+log()  { printf '%b %s\n' "${BLUE}[INFO]${NC}" "$*"; }
+ok()   { printf '%b %s\n' "${GREEN}[ OK ]${NC}" "$*"; }
+warn() { printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$*"; }
+err()  { printf '%b %s\n' "${RED}[FAIL]${NC}" "$*" >&2; }
+
+trap 'err "Build failed at line $LINENO"' ERR
+
+# -------- root check --------
+if [[ ${EUID} -ne 0 ]]; then
+  err "This script installs system packages and must run as root. Re-run with: sudo -E $0"
+  exit 1
+fi
+
+# -------- inputs --------
+ROCM_VERSION="${ROCM_VERSION:-}"          # empty => auto-fetch latest
+GPU_FAMILY="${GPU_FAMILY:-gfx94X-dcgpu}"
+BUILD_TYPE="${BUILD_TYPE:-Release}"
+GITHUB_RUN_NUMBER="${GITHUB_RUN_NUMBER:-1}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="${REPO_ROOT}/build"
+SDK_DIR="${HOME}/rocm-sdk"
+ROCM_PATH="${SDK_DIR}/install"
+
+# Default GPU targets baked into every package, regardless of GPU_FAMILY tarball.
+DEFAULT_GPU_TARGETS="gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
+GPU_TARGETS="${GPU_TARGETS:-$DEFAULT_GPU_TARGETS}"
+
+# -------- detect OS --------
+if [[ -f /etc/os-release ]]; then
+  # shellcheck disable=SC1091
+  . /etc/os-release
+  OS_ID="${ID:-unknown}"
+  OS_LIKE="${ID_LIKE:-}"
+else
+  err "/etc/os-release not found; cannot detect distro"; exit 1
+fi
+
+case "${OS_ID}:${OS_LIKE}" in
+  ubuntu:*|debian:*|*:*debian*) DISTRO="ubuntu" ;;
+  almalinux:*|rocky:*|rhel:*|centos:*|*:*rhel*|*:*fedora*) DISTRO="almalinux" ;;
+  *)
+    if command -v apt-get >/dev/null 2>&1; then DISTRO="ubuntu"
+    elif command -v yum >/dev/null 2>&1 || command -v dnf >/dev/null 2>&1; then DISTRO="almalinux"
+    else err "Unsupported distro: ${OS_ID}"; exit 1
+    fi
+    ;;
+esac
+log "Detected distro: ${DISTRO} (${OS_ID})"
+
+# -------- install dependencies --------
+log "Installing build dependencies..."
+if [[ "${DISTRO}" == "ubuntu" ]]; then
+  export DEBIAN_FRONTEND=noninteractive
+  apt-get update -y
+  apt-get install -y --no-install-recommends \
+    build-essential cmake git curl tar xz-utils ca-certificates pkg-config \
+    python3 python3-pip \
+    libnuma-dev \
+    dpkg-dev rpm file apt-utils
+  CMAKE_BIN="cmake"
+  CMAKE_CXX_COMPILER_OVERRIDE=""
+else
+  # AlmaLinux / Rocky / RHEL / manylinux_2_28
+  if command -v dnf >/dev/null 2>&1; then PKG="dnf"; else PKG="yum"; fi
+  ${PKG} install -y epel-release || true
+  # Enable PowerTools/CRB for createrepo_c, etc.
+  ${PKG} config-manager --set-enabled powertools 2>/dev/null \
+    || ${PKG} config-manager --set-enabled crb 2>/dev/null || true
+  ${PKG} install -y \
+    gcc gcc-c++ make cmake3 git curl tar xz ca-certificates pkgconfig \
+    python3 python3-pip \
+    numactl-devel \
+    rpm-build dpkg createrepo_c file
+  CMAKE_BIN="cmake3"
+  command -v cmake3 >/dev/null 2>&1 || CMAKE_BIN="cmake"
+  CMAKE_CXX_COMPILER_OVERRIDE="${ROCM_PATH}/bin/hipcc"
+fi
+ok "Dependencies installed"
+
+# -------- fetch ROCm SDK from TheRock --------
+TARBALL_BASE="https://therock-nightly-tarball.s3.amazonaws.com"
+TAR_PREFIX="therock-dist-linux-${GPU_FAMILY}-"
+
+# Print every object key under ${TAR_PREFIX}, one per line. A single S3
+# ListObjectsV2 response holds at most 1000 keys, returned in lexicographic
+# (not version) order, so follow NextContinuationToken until the listing is
+# complete; otherwise the newest builds can fall off the first page.
+list_tarball_keys() {
+  local token="" page
+  local -a query
+  while :; do
+    query=(--data-urlencode "list-type=2" --data-urlencode "prefix=${TAR_PREFIX}")
+    if [[ -n "${token}" ]]; then
+      query+=(--data-urlencode "continuation-token=${token}")
+    fi
+    page="$(curl -fsSL -G "${TARBALL_BASE}/" "${query[@]}" 2>/dev/null)" || return 1
+    page="$(tr '<' '\n' <<<"${page}")"
+    sed -n 's|^Key>||p' <<<"${page}"
+    token="$(sed -n 's|^NextContinuationToken>||p' <<<"${page}" | head -n 1)"
+    [[ -n "${token}" ]] || break
+  done
+}
+
+if [[ -z "${ROCM_VERSION}" ]]; then
+  log "ROCM_VERSION not set; auto-fetching latest for ${GPU_FAMILY}..."
+  # No LATEST.txt is published; list the bucket and pick the highest version key.
+  # Filter to versioned tarballs only (skip ADHOCBUILD-* and other non-release keys);
+  # match: <prefix><major>.<minor>.<patch+suffix>.tar.gz
+  LATEST_KEY="$(list_tarball_keys \
+    | grep -E "^${TAR_PREFIX}[0-9]+\.[0-9]+\.[0-9a-z]+\.tar\.gz$" \
+    | sort -V \
+    | tail -n 1 || true)"
+  if [[ -n "${LATEST_KEY}" ]]; then
+    ROCM_VERSION="${LATEST_KEY#"${TAR_PREFIX}"}"
+    ROCM_VERSION="${ROCM_VERSION%.tar.gz}"
+    ok "Latest ROCm version for ${GPU_FAMILY}: ${ROCM_VERSION}"
+  else
+    warn "Could not list ${TARBALL_BASE}; falling back to pinned default"
+    ROCM_VERSION="7.13.0a20260423"
+  fi
+fi
+
+# Parse major/minor up front so a malformed version fails here with a clear
+# message instead of yielding a bogus download URL or install prefix.
+if [[ ! "${ROCM_VERSION}" =~ ^([0-9]+)\.([0-9]+) ]]; then
+  err "Unrecognised ROCM_VERSION '${ROCM_VERSION}' (expected e.g. 7.11.0a20260121)"
+  exit 1
+fi
+ROCM_MAJOR="${BASH_REMATCH[1]}"
+ROCM_MINOR="${BASH_REMATCH[2]}"
+
+TARBALL_NAME="${TAR_PREFIX}${ROCM_VERSION}.tar.gz"
+TARBALL_URL="${TARBALL_BASE}/${TARBALL_NAME}"
+SDK_MARKER="${SDK_DIR}/.installed-${ROCM_VERSION}-${GPU_FAMILY}"
+
+mkdir -p "${SDK_DIR}"
+if [[ ! -d "${ROCM_PATH}" || ! -f "${SDK_MARKER}" ]]; then
+  log "Downloading ${TARBALL_URL}..."
+  curl -fSL "${TARBALL_URL}" -o "${SDK_DIR}/${TARBALL_NAME}"
+  log "Extracting to ${SDK_DIR}..."
+  # ${ROCM_PATH} holds one SDK at a time: drop markers left by any previously
+  # installed version, otherwise a later run requesting that old version would
+  # find its marker and silently reuse the SDK unpacked here.
+  rm -f "${SDK_DIR}"/.installed-*
+  rm -rf "${ROCM_PATH:?}"
+  mkdir -p "${ROCM_PATH}"
+  tar -xzf "${SDK_DIR}/${TARBALL_NAME}" -C "${ROCM_PATH}" --strip-components=1 \
+    || tar -xzf "${SDK_DIR}/${TARBALL_NAME}" -C "${ROCM_PATH}"
+  rm -f "${SDK_DIR}/${TARBALL_NAME}"
+  touch "${SDK_MARKER}"
+  ok "ROCm SDK installed at ${ROCM_PATH}"
+else
+  log "Reusing cached ROCm SDK at ${ROCM_PATH}"
+fi
+
+export ROCM_PATH
+export PATH="${ROCM_PATH}/bin:${PATH}"
+export LD_LIBRARY_PATH="${ROCM_PATH}/lib:${LD_LIBRARY_PATH:-}"
+export CMAKE_PREFIX_PATH="${ROCM_PATH}:${CMAKE_PREFIX_PATH:-}"
+
+# Locate HIP device libraries (amdgcn bitcode)
+for candidate in \
+  "${ROCM_PATH}/amdgcn/bitcode" \
+  "${ROCM_PATH}/lib/llvm/amdgcn/bitcode" \
+  "${ROCM_PATH}/lib/clang/amdgcn/bitcode"; do
+  if [[ -d "${candidate}" ]]; then export HIP_DEVICE_LIB_PATH="${candidate}"; break; fi
+done
+if [[ -n "${HIP_DEVICE_LIB_PATH:-}" ]]; then
+  ok "HIP_DEVICE_LIB_PATH=${HIP_DEVICE_LIB_PATH}"
+else
+  warn "amdgcn bitcode directory not found under ${ROCM_PATH}; build may fail"
+fi
+
+# -------- compute version helpers --------
+# ROCM_MAJOR / ROCM_MINOR were parsed above (e.g. 7.11.0a20260121 -> major=7 minor=11).
+# 10# forces base 10 so a zero-padded component such as "08" is not read as octal.
+printf -v ROCM_LIBPATCH_VERSION '%02d%02d' "$((10#${ROCM_MAJOR}))" "$((10#${ROCM_MINOR}))"
+export ROCM_MAJOR ROCM_MINOR ROCM_LIBPATCH_VERSION
+log "ROCm major=${ROCM_MAJOR} minor=${ROCM_MINOR} libpatch=${ROCM_LIBPATCH_VERSION}"
+
+# Package release string: branch.commit for dev, run_number for release branches
+# (release/*, the same pattern the CI workflow uses to pick the release S3 path).
+GIT_BRANCH="${GITHUB_REF_NAME:-$(git -C "${REPO_ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo unknown)}"
+GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
+if [[ "${GIT_BRANCH}" == release/* ]]; then
+  PKG_RELEASE="${GITHUB_RUN_NUMBER}"
+else
+  # Sanitize: DEB/RPM release fields disallow many punctuation chars.
+  # Collapse anything that's not [A-Za-z0-9] into a single dot, then trim.
+  SAFE_BRANCH="$(printf '%s' "${GIT_BRANCH}" | sed -E 's/[^[:alnum:]]+/./g; s/^\.+//; s/\.+$//')"
+  SAFE_BRANCH="${SAFE_BRANCH:-unknown}"
+  PKG_RELEASE="${SAFE_BRANCH}.${GIT_COMMIT}"
+fi
+export CPACK_DEBIAN_PACKAGE_RELEASE="${CPACK_DEBIAN_PACKAGE_RELEASE:-$PKG_RELEASE}"
+export CPACK_RPM_PACKAGE_RELEASE="${CPACK_RPM_PACKAGE_RELEASE:-$PKG_RELEASE}"
+log "Package release tag: ${PKG_RELEASE}"
+
+# -------- configure --------
+INSTALL_PREFIX="/opt/rocm/extras-${ROCM_MAJOR}"
+# Relocatable RPATH: $ORIGIN-relative + install prefix + the conventional
+# install-time ROCm locations. Do NOT embed ${ROCM_PATH} (the ephemeral
+# build-time SDK download path) — that would leak CI paths into the
+# packaged binary and break relocatability.
+RPATH_LIST="\$ORIGIN:\$ORIGIN/../lib:${INSTALL_PREFIX}/lib:/opt/rocm/lib:/opt/rocm/lib64"
+
+log "Configuring CMake..."
+# :? guards against ever expanding to a bare "rm -rf" of an empty path.
+rm -rf "${BUILD_DIR:?}"
+mkdir -p "${BUILD_DIR}"
+
+CMAKE_ARGS=(
+  -B "${BUILD_DIR}"
+  -S "${REPO_ROOT}"
+  -DCMAKE_BUILD_TYPE="${BUILD_TYPE}"
+  -DROCM_PATH="${ROCM_PATH}"
+  -DROCM_MAJOR_VERSION="${ROCM_MAJOR}"
+  -DHIP_PLATFORM=amd
+  -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}"
+  -DCPACK_PACKAGING_INSTALL_PREFIX="${INSTALL_PREFIX}"
+  -DCMAKE_SKIP_RPATH=FALSE
+  -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE
+  -DCMAKE_INSTALL_RPATH="${RPATH_LIST}"
+  -DCMAKE_VERBOSE_MAKEFILE=ON
+  -DBUILD_RELOCATABLE_PACKAGE=ON
+  -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF
+  -DENABLE_NIC_EXEC=OFF
+  -DENABLE_MPI_COMM=OFF
+  -DDISABLE_DMABUF=OFF
+  -DGPU_TARGETS="${GPU_TARGETS}"
+)
+if [[ -n "${CMAKE_CXX_COMPILER_OVERRIDE}" ]]; then
+  CMAKE_ARGS+=(-DCMAKE_CXX_COMPILER="${CMAKE_CXX_COMPILER_OVERRIDE}")
+fi
+
+"${CMAKE_BIN}" "${CMAKE_ARGS[@]}"
+ok "CMake configured"
+
+# -------- build --------
+log "Building TransferBench (-j$(nproc))..."
+"${CMAKE_BIN}" --build "${BUILD_DIR}" -- -j"$(nproc)"
+ok "Build complete"
+
+# -------- package --------
+log "Packaging (DEB / RPM / TGZ via CPack)..."
+# Where CMake is installed as cmake3 the packager may be named cpack3; prefer
+# it so CPack comes from the same CMake that generated the build tree.
+CPACK_BIN="cpack"
+if [[ "${CMAKE_BIN}" == "cmake3" ]] && command -v cpack3 >/dev/null 2>&1; then
+  CPACK_BIN="cpack3"
+fi
+pushd "${BUILD_DIR}" >/dev/null
+if [[ "${DISTRO}" == "ubuntu" ]]; then
+  "${CPACK_BIN}" -G DEB
+  "${CPACK_BIN}" -G TGZ
+else
+  "${CPACK_BIN}" -G RPM
+  "${CPACK_BIN}" -G TGZ
+fi
+popd >/dev/null
+
+ok "Packages written under ${BUILD_DIR}:"
+ls -lh "${BUILD_DIR}"/amdrocm*-transferbench* 2>/dev/null || ls -lh "${BUILD_DIR}"/*.deb "${BUILD_DIR}"/*.rpm "${BUILD_DIR}"/*.tar.gz 2>/dev/null || true