Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ci/official/any.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ if [[ -n "${TF_ANY_EXTRA_ENV:-}" ]]; then
export TFCI="$TFCI,$TF_ANY_EXTRA_ENV"
fi
if [[ -n "${TF_ANY_SCRIPT:-}" ]]; then
"$TF_ANY_SCRIPT"
# To run ROCM tests inside docker
source "${BASH_SOURCE%/*}/utilities/setup.sh"
tfrun "$TF_ANY_SCRIPT"
elif [[ -n "${TF_ANY_TARGETS:-}" ]]; then
source "${BASH_SOURCE%/*}/utilities/setup.sh"
tfrun bazel "${TF_ANY_MODE:-test}" $TFCI_BAZEL_COMMON_ARGS $TF_ANY_TARGETS
Expand Down
98 changes: 98 additions & 0 deletions ci/official/containers/ml_build/Dockerfile.rocm
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
################################################################################
# Build image for TensorFlow's ROCm CI: an Ubuntu base with devtoolset-9,
# several Python versions, the ROCm packages and the Bazel-related tooling
# installed by the steps below.
# The default base image is pinned by digest so rebuilds start from the same
# Ubuntu 22.04 layers; override BASE_IMAGE with --build-arg to change it.
ARG BASE_IMAGE=ubuntu:22.04@sha256:58b87898e82351c6cf9cf5b9f3c20257bb9e2dcf33af051e12ce532d7f94e3fe
FROM $BASE_IMAGE AS devel
# See https://docs.docker.com/reference/dockerfile/#understand-how-arg-and-from-interact
# on why we cannot reference BASE_IMAGE again unless we declare it again.
################################################################################

# Install devtoolset build dependencies
# /setup.packages.sh stays in the image: setup.rocm.sh calls it again later to
# install devel.packages.rocm.txt.
COPY setup.sources.sh /setup.sources.sh
COPY setup.packages.sh /setup.packages.sh
COPY builder.packages.txt /builder.packages.txt

RUN /setup.sources.sh && /setup.packages.sh /builder.packages.txt

# Install devtoolset-9 in /dt9 with glibc 2.17 and libstdc++ 4.8, for building
# manylinux2014-compatible packages.
# The helper scripts and the glibc patch are copied to / because
# build_devtoolset.sh is run from there with the toolset name and target prefix.
COPY builder.devtoolset/fixlinks.sh /fixlinks.sh
COPY builder.devtoolset/rpm-patch.sh /rpm-patch.sh
COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh
COPY builder.devtoolset/glibc2.17-inline.patch /glibc2.17-inline.patch
RUN /build_devtoolset.sh devtoolset-9 /dt9

# Setup Python
# Every interpreter is installed by the same script with the same requirements
# file, one RUN (and therefore one image layer) per version.
# NOTE(review): the "-nogil" variants are presumably the free-threaded builds;
# confirm against setup.python.sh.
COPY setup.python.sh /setup.python.sh
COPY builder.requirements.txt /builder.requirements.txt
RUN /setup.python.sh python3.9 /builder.requirements.txt
RUN /setup.python.sh python3.10 /builder.requirements.txt
RUN /setup.python.sh python3.11 /builder.requirements.txt
RUN /setup.python.sh python3.13 /builder.requirements.txt
RUN /setup.python.sh python3.13-nogil /builder.requirements.txt
RUN /setup.python.sh python3.14 /builder.requirements.txt
RUN /setup.python.sh python3.14-nogil /builder.requirements.txt

# Since we are using python3.12 as the default python version, we need to
# install python3.12 last for now.
# TODO(b/376338367): switch to pyenv.
RUN /setup.python.sh python3.12 /builder.requirements.txt

# Install ROCm packages
# GPU_DEVICE_TARGETS is a comma-separated list of AMD GPU architectures. It is
# exported into the image environment (also as TF_ROCM_AMDGPU_TARGETS), and
# setup.rocm.sh appends it to $ROCM_PATH/bin/target.lst.
ARG GPU_DEVICE_TARGETS="gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201"
ENV GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS}
ENV TF_ROCM_AMDGPU_TARGETS=${GPU_DEVICE_TARGETS}
ARG ROCM_VERSION=6.2.0
# CUSTOM_INSTALL optionally names a script in the build context. When the file
# /${CUSTOM_INSTALL} exists, setup.rocm.sh runs it instead of configuring the
# repo.radeon.com package repositories. Build ARGs are visible to RUN commands,
# which is how the script sees this value.
# NOTE(review): with CUSTOM_INSTALL unset, the COPY source below expands to an
# empty string -- confirm the builder in use handles that as intended.
ARG CUSTOM_INSTALL
# ROCM_PATH defaults to the versioned install prefix and is exported so that
# setup.rocm.sh and later build steps agree on the location.
ARG ROCM_PATH=/opt/rocm-${ROCM_VERSION}
ENV ROCM_PATH=${ROCM_PATH}
COPY ${CUSTOM_INSTALL} /${CUSTOM_INSTALL}
COPY setup.rocm.sh /setup.rocm.sh
COPY devel.packages.rocm.txt /devel.packages.rocm.txt
# Second argument is the distro codename; "jammy" matches the Ubuntu 22.04 base.
RUN /setup.rocm.sh $ROCM_VERSION jammy




# Setup links for TensorFlow to compile.
# Referenced in devel.usertools/*.bazelrc.
# Set python3.12 as the default python version.
# TF does not support python3.13.
# -f replaces whatever python/python3 links earlier install steps left behind.
RUN ln -sf /usr/bin/python3.12 /usr/bin/python3
RUN ln -sf /usr/bin/python3.12 /usr/bin/python
RUN ln -sf /usr/lib/python3.12 /usr/lib/tf_python

# Make sure clang is on the path
# llvm-18 / clang-18 are installed from devel.packages.rocm.txt. Unlike the
# links above this uses plain `ln -s`, so the step fails if /usr/bin/clang
# already exists.
RUN ln -s /usr/lib/llvm-18/bin/clang /usr/bin/clang


# Install various tools.
# - bats: bash unit testing framework
# - bazelisk: always use the correct bazel version
# - buildifier: clean bazel build deps
# - buildozer: clean bazel build deps
# - gcloud SDK: communicate with Google Cloud Platform (GCP) for RBE, CI
# - patchelf: Utility tool to modify existing ELF executables and libraries
# Every download is pinned to a release tag except the gcloud SDK, which tracks
# the "rapid" channel.
RUN git clone --branch v1.11.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core
# bazelisk is installed under the name "bazel" so plain `bazel` invocations
# go through it.
RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.21.0/bazelisk-linux-amd64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-amd64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildozer-linux-amd64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
# curl: -f fails on HTTP errors instead of piping an error page into tar,
# -sS hides the progress meter but keeps error messages, -L follows redirects.
# tar: -C / makes the extraction directory explicit, because the symlink and
# PATH entry below hard-code /google-cloud-sdk.
RUN curl -fsSL https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz | \
    tar zxf - -C / google-cloud-sdk && \
    /google-cloud-sdk/install.sh --quiet && \
    ln -s /google-cloud-sdk/bin/gcloud /usr/bin/gcloud
ENV PATH="$PATH:/google-cloud-sdk/bin/"

# Download and install patchelf v0.18.0 from GitHub. The default Ubuntu focal
# packages only provide the "0.10-2build1" version. We use patchelf to manipulate
# certain shared libraries during the wheel building process (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/build_pip_package.sh#L255-L262).
# When we use Patchelf versions <0.12, those shared libraries end up with a
# corrupted PT_NOTE program header. This was fixed in v0.12, see https://github.com/NixOS/patchelf/commit/43a33482b501b0f5ee9da312aabfca3806570cc9.
# NOTE(review): this image is built on Ubuntu 22.04 (jammy), not focal, so the
# packaged version quoted above may be out of date -- confirm.
# The tarball is unpacked over /usr and removed in the same layer so the
# archive itself does not end up in the image.
RUN wget https://github.com/NixOS/patchelf/releases/download/0.18.0/patchelf-0.18.0-x86_64.tar.gz && tar -zxvf patchelf-0.18.0-x86_64.tar.gz -C /usr && rm -rf patchelf-0.18.0-x86_64.tar.gz

# Setup ENV variables for tensorflow pip build
# ROCM_TOOLKIT_PATH mirrors the ROCM_PATH exported earlier in this file.
# NOTE(review): TF_NEED_ROCM / TF_ROCM_GCC are presumably read by TensorFlow's
# build configuration; confirm their exact meaning there.
ENV TF_NEED_ROCM=1
ENV TF_ROCM_GCC=1
ENV ROCM_TOOLKIT_PATH=${ROCM_PATH}

# Don't use the bazel cache when a new docker image is created.
# The $(date ...)$RANDOM value is evaluated once, while the image is being
# built, and written to /etc/bazel.bazelrc, so each newly built image carries
# its own action_env / host_action_env value.
RUN echo build --action_env=DOCKER_CACHEBUSTER=$(date +%s%N)$RANDOM >> /etc/bazel.bazelrc
RUN echo build --host_action_env=DOCKER_HOST_CACHEBUSTER=$(date +%s%N)$RANDOM >> /etc/bazel.bazelrc
42 changes: 42 additions & 0 deletions ci/official/containers/ml_build/devel.packages.rocm.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Package list for the ROCm build image; setup.rocm.sh passes this file to
# /setup.packages.sh after the ROCm package repositories are configured.

# All required ROCM packages
rocm-ml-sdk
rocm-dev

# Other build-related tools
apt-transport-https
ca-certificates
colordiff
llvm-18
clang-18
libclang-rt-18-dev
curl
ffmpeg
git
gpg-agent
jq
less
libcurl3-dev
libcurl4-openssl-dev
libfreetype6-dev
libhdf5-serial-dev
libssl-dev
libtool
libzmq3-dev
lld-18
moreutils
openjdk-11-jdk
openjdk-11-jre-headless
# NOTE(review): Dockerfile.rocm also unpacks patchelf 0.18.0 from a GitHub
# release tarball into /usr; confirm this distro package is still needed.
patchelf
pkg-config
python3-dev
python3-setuptools
python3-pip
rsync
software-properties-common
sudo
swig
unzip
vim
wget
zip
zlib1g-dev
152 changes: 152 additions & 0 deletions ci/official/containers/ml_build/setup.rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/usr/bin/env bash
#
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# setup.rocm.sh: Prepare the ROCM installation on the container.
# Usage: setup.rocm.sh <ROCM_VERSION> <DISTRO>
# Supported Distros:
# - focal
# - jammy
# - noble
# - el7
# - el8
set -x

# Get arguments (or defaults).
#   $1: ROCm version to install (default: 6.2.0).
#   $2: target distro (default: focal). Must be one of the names accepted
#       below; anything else prints the supported list and exits with status 1.
ROCM_VERSION=6.2.0
DISTRO=focal
if [[ -n $1 ]]; then
  ROCM_VERSION=$1
fi
if [[ -n $2 ]]; then
  case "$2" in
    focal|jammy|noble|el7|el8)
      DISTRO=$2
      ;;
    *)
      echo "Distro not supported"
      # printf interprets the \n escapes; a plain `echo` printed them literally
      # and put the whole list on one line.
      printf 'Supported distros are:\n focal\n jammy\n noble\n el7\n el8\n'
      exit 1
      ;;
  esac
fi

# Keep ROCM_PATH from the environment when it is set; otherwise derive it from
# the requested version.
ROCM_PATH=${ROCM_PATH:-/opt/rocm-${ROCM_VERSION}}
# Initial releases don't have the trailing '.0' in their repository path.
# For example ROCM 5.4.0 is at https://repo.radeon.com/rocm/apt/5.4/
# The expansion in the test keeps only the digits after the last non-digit
# character, i.e. the final version component; when that component is 0 it is
# dropped from ROCM_VERS.
if [ ${ROCM_VERSION##*[^0-9]} -eq '0' ]; then
ROCM_VERS=${ROCM_VERSION%.*}
else
ROCM_VERS=$ROCM_VERSION
fi

# Configure the ROCm / amdgpu package repositories for the selected distro and
# install the ROCm packages. When the file /${CUSTOM_INSTALL} exists it is run
# instead of the default repo.radeon.com repository setup.
if [[ "$DISTRO" == "focal" ]] || [[ "$DISTRO" == "jammy" ]] || [[ "$DISTRO" == "noble" ]]; then
  ROCM_DEB_REPO_HOME=https://repo.radeon.com/rocm/apt/
  AMDGPU_DEB_REPO_HOME=https://repo.radeon.com/amdgpu/
  ROCM_BUILD_NAME=${DISTRO}
  ROCM_BUILD_NUM=main

  # Adjust the ROCM repo location
  ROCM_DEB_REPO=${ROCM_DEB_REPO_HOME}${ROCM_VERS}/
  AMDGPU_DEB_REPO=${AMDGPU_DEB_REPO_HOME}${ROCM_VERS}/

  # apt-get is used throughout: the `apt` front end does not offer a stable
  # command-line interface for scripts.
  DEBIAN_FRONTEND=noninteractive apt-get --allow-unauthenticated update
  DEBIAN_FRONTEND=noninteractive apt-get install -y wget software-properties-common
  DEBIAN_FRONTEND=noninteractive apt-get clean

  if [ ! -f "/${CUSTOM_INSTALL}" ]; then
    # Add rocm repository
    #chmod 1777 /tmp
    #wget -qO - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -;

    # Make the directory if it doesn't exist yet.
    # This location is recommended by the distribution maintainers.
    mkdir --parents --mode=0755 /etc/apt/keyrings

    # Download the key, convert the signing-key to a full
    # keyring required by apt and store in the keyring directory
    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
      gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null

    # AMDGPU_DEB_REPO already ends in '/', so "ubuntu" is appended directly to
    # avoid a double slash in the repository URL.
    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] ${AMDGPU_DEB_REPO}ubuntu $ROCM_BUILD_NAME $ROCM_BUILD_NUM" | tee --append /etc/apt/sources.list.d/amdgpu.list
    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" | tee /etc/apt/sources.list.d/rocm.list
    # Prefer packages from repo.radeon.com over the distro's own.
    echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
      | tee /etc/apt/preferences.d/rocm-pin-600
  else
    bash "/${CUSTOM_INSTALL}"
  fi
  apt-get update --allow-insecure-repositories


  # install rocm
  /setup.packages.sh /devel.packages.rocm.txt

  # Collect the MIOpen kernel packages whose names contain no '.'; `|| true`
  # keeps the pipeline's status zero when grep selects nothing.
  MIOPENKERNELS=$( \
    apt-cache search --names-only miopen-hip-gfx | \
    awk '{print $1}' | \
    grep -F -v . || \
    true )
  # MIOPENKERNELS is left unquoted on purpose so that every package name
  # becomes a separate argument.
  DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}

  #install hipblasLT if available
  DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated hipblaslt-dev || true

elif [[ "$DISTRO" == "el7" ]]; then
  if [ ! -f "/${CUSTOM_INSTALL}" ]; then
    RPM_ROCM_REPO=http://repo.radeon.com/rocm/yum/${ROCM_VERS}/main
    echo -e "[ROCm]\nname=ROCm\nbaseurl=$RPM_ROCM_REPO\nenabled=1\ngpgcheck=0" >>/etc/yum.repos.d/rocm.repo
    echo -e "[amdgpu]\nname=amdgpu\nbaseurl=https://repo.radeon.com/amdgpu/${ROCM_VERS}/rhel/7/main/x86_64/\nenabled=1\ngpgcheck=0" >>/etc/yum.repos.d/amdgpu.repo
  else
    bash "/${CUSTOM_INSTALL}"
  fi
  yum clean all

  # install rocm
  /setup.packages.rocm.cs7.sh /devel.packages.rocm.cs7.txt

  # install hipblasLT if available
  yum --enablerepo=extras install -y hipblaslt-devel || true

elif [[ "$DISTRO" == "el8" ]]; then
  if [ ! -f "/${CUSTOM_INSTALL}" ]; then
    RPM_ROCM_REPO=http://repo.radeon.com/rocm/rhel8/${ROCM_VERS}/main
    echo -e "[ROCm]\nname=ROCm\nbaseurl=$RPM_ROCM_REPO\nenabled=1\ngpgcheck=1\ngpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" >>/etc/yum.repos.d/rocm.repo
    echo -e "[amdgpu]\nname=amdgpu\nbaseurl=https://repo.radeon.com/amdgpu/${ROCM_VERS}/rhel/8.10/main/x86_64/\nenabled=1\ngpgcheck=1\ngpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" >>/etc/yum.repos.d/amdgpu.repo
  else
    bash "/${CUSTOM_INSTALL}"
  fi
  dnf clean all

  # install rocm
  /setup.packages.rocm.el8.sh /devel.packages.rocm.el8.txt

  # install hipblasLT if available
  dnf --enablerepo=extras,epel,elrepo,build_system install -y hipblaslt-devel || true
fi

# ver: print a dotted version string (e.g. "6.2.0") as zero-padded three-digit
# fields ("006002000") so that two versions can be compared as integers.
function ver {
  local dotted="$1"
  # Unquoted on purpose: once the dots are turned into spaces, word splitting
  # hands each component to printf as a separate argument.
  printf "%03d%03d%03d" ${dotted//./ }
}
# With ROCm 6.0.x and 6.1.x, hipcc uses llvm-17 while the host compiler is
# llvm-18. That leads to a mismatch in name mangling, resulting in a failure to
# link the compiled GPU kernels. For ROCm versions below 6.2.0 the compile
# option -fclang-abi-compat=17 is therefore added to the rocm_base bazel config
# to circumvent the issue.
if [ $(ver "$ROCM_VERSION") -lt $(ver "6.2.0") ]
then
echo "build:rocm_base --copt=-fclang-abi-compat=17" >> /etc/bazel.bazelrc
fi

# Log the effective configuration into the build output.
echo $ROCM_VERSION
# ROCM_REPO is never assigned by this script. It is still honoured when the
# environment provides it; otherwise fall back to the apt repository URL
# computed for the Debian-based distros.
echo ${ROCM_REPO:-${ROCM_DEB_REPO:-}}
echo $ROCM_PATH
echo $GPU_DEVICE_TARGETS

# Ensure the ROCm target list is set up, one GPU architecture per line.
# GPU_DEVICE_TARGETS is left unquoted so a whitespace-separated list is split
# by the shell; commas are turned into newlines so that a comma-separated list
# (the form Dockerfile.rocm passes in) is split as well instead of ending up
# on a single line.
printf '%s\n' ${GPU_DEVICE_TARGETS} | tr ',' '\n' | tee -a "$ROCM_PATH/bin/target.lst"
# Make sure the version marker file exists.
touch "${ROCM_PATH}/.info/version"
29 changes: 29 additions & 0 deletions ci/official/envs/linux_x86_rocm
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# ROCm overrides layered on top of the generic Linux x86 environment.
source ci/official/envs/linux_x86
# Using image rocm/tensorflow-build:latest-jammy-pythonall-rocm7.0.2-ci_official
TFCI_DOCKER_IMAGE="rocm/tensorflow-build@sha256:abd4ae15bab1292ba5cb6f7feab43e167b34963f991fc911478e9de65d54b1a3"
TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --repo_env=USE_PYWRAP_RULES=True --config rocm"
TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=tensorflow"
# Expose the AMD GPU device nodes (/dev/dri, /dev/kfd) to the container.
TFCI_DOCKER_ARGS="--device /dev/dri --device /dev/kfd --device=/dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged --shm-size 64G"
TFCI_LIB_SUFFIX="-gpu-linux-x86_64"
# wheel.sh runs `rocm-smi` inside the container when TFCI_ROCM_SMI_ENABLE is 1
# and `nvidia-smi` when TFCI_NVIDIA_SMI_ENABLE is 1.
TFCI_ROCM_SMI_ENABLE=1
TFCI_NVIDIA_SMI_ENABLE=0
# Passed by wheel.sh to bazel as --config=$TFCI_BAZEL_CONFIG.
TFCI_BAZEL_CONFIG="rocm"
# TODO: Set back to 610M once the wheel size is fixed.
TFCI_WHL_SIZE_LIMIT=630M
TFCI_WHL_AUDIT_ENABLE=0
TFCI_WHL_BAZEL_TEST_ENABLE=0
1 change: 1 addition & 0 deletions ci/official/utilities/cleanup_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ $ docker exec -it tf bash
EOF

docker ps
docker rm -f tf-${TFCI_PYTHON_VERSION}
4 changes: 2 additions & 2 deletions ci/official/utilities/setup_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ if ! docker container inspect tf >/dev/null 2>&1 ; then
echo "GCE_METADATA_HOST=$IP_ADDR" >> $env_file
fi

docker run $TFCI_DOCKER_ARGS --name tf -w "$WORKING_DIR" -itd --rm \
docker run $TFCI_DOCKER_ARGS --name tf-$TFCI_PYTHON_VERSION -w "$WORKING_DIR" -itd --rm \
-v "$TFCI_GIT_DIR:$WORKING_DIR" \
--env-file "$env_file" \
"$TFCI_DOCKER_IMAGE" \
Expand All @@ -65,4 +65,4 @@ if ! docker container inspect tf >/dev/null 2>&1 ; then
fi

fi
tfrun() { docker exec tf "$@"; }
tfrun() { docker exec tf-$TFCI_PYTHON_VERSION "$@"; }
9 changes: 7 additions & 2 deletions ci/official/wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ if [[ "$TFCI_NVIDIA_SMI_ENABLE" == 1 ]]; then
tfrun nvidia-smi
fi

# Record GPU count and ROCM version status
if [[ "$TFCI_ROCM_SMI_ENABLE" == 1 ]]; then
tfrun rocm-smi
fi

# Update the version numbers for Nightly only
if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then
python_bin=python3
Expand All @@ -39,7 +44,7 @@ if [[ "$TFCI_WHL_NUMPY_VERSION" == 1 ]]; then
cp ./ci/official/requirements_updater/numpy1_requirements/*.txt .
fi

tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS build $TFCI_BAZEL_COMMON_ARGS --config=cuda_wheel //tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_BASE_ARGS $TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG --verbose_failures
tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS build $TFCI_BAZEL_COMMON_ARGS --config=$TFCI_BAZEL_CONFIG //tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_BASE_ARGS $TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG --verbose_failures

tfrun "$TFCI_FIND_BIN" ./bazel-bin/tensorflow/tools/pip_package -iname "*.whl" -exec cp {} $TFCI_OUTPUT_DIR \;
tfrun mkdir -p ./dist
Expand All @@ -53,7 +58,7 @@ if [[ -n "$TFCI_BUILD_PIP_PACKAGE_ADDITIONAL_WHEEL_NAMES" ]]; then
for wheel_name in ${TFCI_BUILD_PIP_PACKAGE_ADDITIONAL_WHEEL_NAMES}; do
echo "Building for additional WHEEL_NAME: ${wheel_name}"
CURRENT_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=${wheel_name}"
tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS build $TFCI_BAZEL_COMMON_ARGS --config=cuda_wheel //tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_BASE_ARGS $CURRENT_WHEEL_NAME_ARG
tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS build $TFCI_BAZEL_COMMON_ARGS --config=$TFCI_BAZEL_CONFIG //tensorflow/tools/pip_package:wheel $TFCI_BUILD_PIP_PACKAGE_BASE_ARGS $CURRENT_WHEEL_NAME_ARG
# Copy the wheel that was just created
tfrun bash -c "$TFCI_FIND_BIN ./bazel-bin/tensorflow/tools/pip_package -iname "${wheel_name}*.whl" -printf '%T+ %p\n' | sort | tail -n 1 | awk '{print \$2}' | xargs -I {} cp {} $TFCI_OUTPUT_DIR"
done
Expand Down