Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ if [[ "$image" == *-focal* ]]; then
UBUNTU_VERSION=20.04
elif [[ "$image" == *-jammy* ]]; then
UBUNTU_VERSION=22.04
elif [[ "$image" == *-noble* ]]; then
UBUNTU_VERSION=24.04
elif [[ "$image" == *ubuntu* ]]; then
extract_version_from_image_name ubuntu UBUNTU_VERSION
elif [[ "$image" == *centos* ]]; then
Expand Down
3 changes: 3 additions & 0 deletions .ci/docker/common/install_base.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ install_ubuntu() {
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
cmake3="cmake=3.22*"
maybe_libiomp_dev=""
elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
cmake3="cmake=3.28*"
maybe_libiomp_dev=""
else
cmake3="cmake=3.5*"
maybe_libiomp_dev="libiomp-dev"
Expand Down
4 changes: 4 additions & 0 deletions .ci/docker/common/install_conda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ fi
conda_install_through_forge libstdcxx-ng=12
fi

# Request libstdcxx-ng 14 when the conda Python is 3.12 or the image is any
# Ubuntu 24.04.x release.
# The Ubuntu check must use [[ ]]: inside single brackets, "24.04"* is not a
# pattern match (the word undergoes filename expansion and is then compared
# literally), so the intended version-prefix match would not happen.
if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
  conda_install_through_forge libstdcxx-ng=14
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the libstdcxx-ng version dependent on Ubuntu version or python version?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I recall correctly, this is the lowest version available for Ubuntu 24.04; Python doesn't have a strict requirement for this version. But I'm not 100% sure, as I worked on this some time ago.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds like at the minimum, this condition needs to be on OS version. I assume the reason is similar to pytorch#121556 where we get symbol version errors when building PyTorch?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is dependent on Ubuntu 24.04, and the minimum Python version there is 3.12, so I will update the condition.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the condition.

fi

# Install some other packages, including those needed for Python test reporting
pip_install -r /opt/conda/requirements-ci.txt

Expand Down
5 changes: 5 additions & 0 deletions .ci/docker/common/install_rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ install_ubuntu() {
# gpg-agent is not available by default on 20.04
apt-get install -y --no-install-recommends gpg-agent
fi
if [[ $UBUNTU_VERSION == 24.04 ]]; then
apt-get install -y --no-install-recommends gpg-agent
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
| sudo tee /etc/apt/preferences.d/rocm-pin-600
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to DevOps, this is needed for Ubuntu 22.04 onwards (as also confirmed by https://github.com/ROCm/ROCm-docker/blob/master/build_all.sh). So we should:

  1. update this condition to be applicable for 22.04 onwards,
  2. cherry-pick this change into all release branches and
  3. Remove the corresponding patch logic in DevOps's build_pytorch.bash

fi
apt-get install -y kmod
apt-get install -y wget

Expand Down
7 changes: 7 additions & 0 deletions .ci/docker/common/install_user.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

set -ex

# Ubuntu 24.04 images ship with a pre-created 'ubuntu' user that owns uid 1000.
# Remove that user so that uid 1000 can be reused by this script.
if [[ $UBUNTU_VERSION == 24.04 ]]; then
# Creating the mail spool file first suppresses a harmless error message
# (presumably userdel -r complaining that the spool is missing -- TODO confirm).
touch /var/mail/ubuntu && chown ubuntu /var/mail/ubuntu && userdel -r ubuntu
fi
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would seem to be a problem for upstream PyTorch as well. Can we file an upstream GitHub issue with logs and error snippets so that they're aware of this and might come up with a different way to address this? It's okay to merge this patch in rocm6.3_internal_testing though.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iupaikov-amd Can you please file a github issue on pytorch/pytorch for this? Upstream PyTorch team would like some more details. Please discuss with Pruthvi or me if you have questions regarding what info to mention on the issue.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Upstream issue about install_user.sh: pytorch#138812


# Mirror jenkins user in container
# jenkins user as ec2-user should have the same user-id
echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd
Expand Down
3 changes: 3 additions & 0 deletions .ci/docker/requirements-ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ sympy==1.12.1
#test that import:

coremltools==5.0b5 ; python_version < "3.12"
coremltools==7.2 ; python_version == "3.12"
#Description: Apple framework for ML integration
#Pinned versions: 5.0b5, 7.2
#test that import:
Expand Down Expand Up @@ -64,6 +65,7 @@ lark==0.12.0
#test that import:

librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2, 0.10.2
#test that import: test_spectral_ops.py
Expand Down Expand Up @@ -112,6 +114,7 @@ networkx==2.8.8
numba==0.49.0 ; python_version < "3.9"
numba==0.54.1 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1, 0.55.2, 0.60.0
#test that import: test_numba_integration.py
Expand Down
2 changes: 1 addition & 1 deletion c10/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ if(NOT BUILD_LIBTORCHLESS)
# ---[ Dependency of c10_hip
target_link_libraries(c10_hip PUBLIC c10)

target_link_libraries(c10_hip PUBLIC ${PYTORCH_HIP_LIBRARIES})
target_link_libraries(c10_hip PUBLIC ${PYTORCH_HIP_LIBRARIES} ${ROCM_HSART_LIB})

target_include_directories(
c10_hip PUBLIC
Expand Down
3 changes: 2 additions & 1 deletion caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ if(USE_ROCM)
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
target_link_libraries(caffe2_nvrtc ${PYTORCH_HIP_LIBRARIES} ${ROCM_HIPRTC_LIB})
target_link_libraries(caffe2_nvrtc ${PYTORCH_HIP_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_HSART_LIB})
target_include_directories(caffe2_nvrtc PRIVATE ${CMAKE_BINARY_DIR})
target_compile_definitions(caffe2_nvrtc PRIVATE USE_ROCM __HIP_PLATFORM_AMD__)
install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}")
Expand Down Expand Up @@ -1417,6 +1417,7 @@ target_link_libraries(torch_cpu PUBLIC c10)
target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS})
target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS})
target_link_libraries(torch_cpu PUBLIC ${ROCM_HSART_LIB})
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other ROCM_HSART_LIB usages seem like they should be avoidable as well, but this one seems the most egregious, being a torch_cpu dependency? I wonder if @naromero77amd's latest refactor in https://github.com/pytorch/pytorch/pull/137112/files might help with all the ROCM_HSART_LIB occurrences because it uses CMake targets instead of paths to .so files.

Copy link
Collaborator Author

@pruthvistony pruthvistony Oct 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jithunnair-amd ,
torch_cpu is already dependent on libamdhip64.so even before this change.

root@ctr-ubbsmc12:/var/lib/jenkins/pytorch/build/lib# ldd libtorch_cpu.so
linux-vdso.so.1 (0x00007fff799f6000)
libc10.so (0x00007fd3ae78e000)
libgcc_s.so.1 => /opt/conda/envs/py_3.10/lib/libgcc_s.so.1 (0x00007fd3ae775000)
libmkl_intel_lp64.so.1 => /opt/conda/envs/py_3.10/lib/libmkl_intel_lp64.so.1 (0x00007fd3adbd6000)
libmkl_gnu_thread.so.1 => /opt/conda/envs/py_3.10/lib/libmkl_gnu_thread.so.1 (0x00007fd3ac04b000)
libmkl_core.so.1 => /opt/conda/envs/py_3.10/lib/libmkl_core.so.1 (0x00007fd3a7bdb000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fd3a7ae6000)
libgomp.so.1 => /opt/conda/envs/py_3.10/lib/libgomp.so.1 (0x00007fd3a7aad000)
libroctracer64.so.4 => /opt/rocm/lib/libroctracer64.so.4 (0x00007fd3a7a44000)
libamdhip64.so.6 => /opt/rocm/lib/libamdhip64.so.6 (0x00007fd3a6181000)
libstdc++.so.6 => /opt/conda/envs/py_3.10/lib/libstdc++.so.6 (0x00007fd3a5fcd000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fd3a5da2000)
/lib64/ld-linux-x86-64.so.2 (0x00007fd3bb09b000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fd3a5d95000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fd3a5d90000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fd3a5d8b000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fd3a5d86000)
libhsa-runtime64.so.1 => /opt/rocm/lib/libhsa-runtime64.so.1 (0x00007fd3a5a49000)
librocprofiler-register.so.0 => /opt/rocm/lib/librocprofiler-register.so.0 (0x00007fd3a59c7000)
libamd_comgr.so.2 => /opt/rocm/lib/libamd_comgr.so.2 (0x00007fd39b4e1000)
libelf.so.1 => /lib/x86_64-linux-gnu/libelf.so.1 (0x00007fd39b4c3000)
libdrm.so.2 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm.so.2 (0x00007fd39b4a9000)
libdrm_amdgpu.so.1 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1 (0x00007fd39b497000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fd39b47b000)
libzstd.so.1 => /lib/x86_64-linux-gnu/libzstd.so.1 (0x00007fd39b3ac000)
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fd39b37a000)

if(USE_MPI)
target_link_libraries(torch_cpu PRIVATE MPI::MPI_CXX)
endif()
Expand Down
2 changes: 1 addition & 1 deletion cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1081,7 +1081,7 @@ if(USE_ROCM)
hip_include_directories(${Caffe2_HIP_INCLUDE})

set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS
${PYTORCH_HIP_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB})
${PYTORCH_HIP_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB} ${ROCM_HSART_LIB})
list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${hipblaslt_LIBRARIES})

list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS
Expand Down
2 changes: 2 additions & 0 deletions cmake/public/LoadHIP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ if(HIP_FOUND)
find_library(ROCM_HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib)
# roctx is part of roctracer
find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)
# HSA runtime lib
find_library(ROCM_HSART_LIB hsa-runtime64 HINTS ${ROCM_PATH}/lib)

# check whether HIP declares new types
set(file "${PROJECT_BINARY_DIR}/hip_new_types.cc")
Expand Down
2 changes: 1 addition & 1 deletion torch/lib/libshm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ if(BUILD_LIBTORCHLESS)
target_link_libraries(torch_shm_manager PRIVATE shm ${C10_LIB})
else()
# we need to link directly to c10 here otherwise we miss symbols
target_link_libraries(torch_shm_manager PRIVATE shm c10)
target_link_libraries(torch_shm_manager PRIVATE shm c10 ${ROCM_HSART_LIB})
endif()
set_target_properties(torch_shm_manager PROPERTIES
INSTALL_RPATH "${_rpath_portable_origin}/../lib")
Expand Down