From c627b57d592079de2c37f462f717568b4b17d439 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Tue, 18 Nov 2025 03:02:33 -0800 Subject: [PATCH 1/2] add support to distros other than Ubuntu --- lib/docker_lib.py | 74 ++++++++--- lib/linux_utils.py | 138 +++++++++++++++++++- tests/health/install/install_babelstream.py | 30 ++++- tests/health/install/install_rocblas.py | 31 ++++- tests/health/install/install_rvs.py | 28 +++- tests/health/rocblas_cvs.py | 22 +++- tests/ibperf/install_ibperf_tools.py | 30 ++++- 7 files changed, 306 insertions(+), 47 deletions(-) diff --git a/lib/docker_lib.py b/lib/docker_lib.py index 82a1c6ba..f4e99a9b 100644 --- a/lib/docker_lib.py +++ b/lib/docker_lib.py @@ -17,7 +17,7 @@ from utils_lib import * from verify_lib import * - +from linux_utils import detect_distro def get_running_docker_containers(phdl): @@ -86,21 +86,63 @@ def old_install_docker_on_ubuntu( phdl ): def install_docker_on_ubuntu( phdl ): - phdl.exec('sudo rm /etc/apt/keyrings/docker.gpg') - phdl.exec('sudo rm /etc/apt/sources.list.d/docker.list') - phdl.exec('sudo apt-get -y update') - phdl.exec('sudo apt install -y apt-transport-https ca-certificates curl software-properties-common') - phdl.exec('curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -') - phdl.exec('sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"') - phdl.exec('apt-cache policy docker-ce') - phdl.exec('sudo apt install -y docker-ce') - time.sleep(3) - phdl.exec('sudo systemctl start docker') - time.sleep(3) - phdl.exec('sudo systemctl status docker') - - - + cmds = ['sudo rm /etc/apt/keyrings/docker.gpg', + 'sudo rm /etc/apt/sources.list.d/docker.list', + 'sudo apt-get -y update', + 'sudo apt install -y apt-transport-https ca-certificates curl software-properties-common', + 'curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -', + 'sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"', + 'apt-cache policy docker-ce', + 'sudo apt install -y docker-ce'] + for cmd in cmds: + out_dict = phdl.exec(cmd) + for node in out_dict.keys(): + if re.search( 'error|fail', out_dict[node], re.I ): + fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs') + + sysctl_cmds = ['sudo systemctl start docker', + 'sudo systemctl enable docker'] + for cmd in sysctl_cmds: + out_dict = phdl.exec(cmd) + for node in out_dict.keys(): + if re.search( 'error|fail', out_dict[node], re.I ): + fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs') + time.sleep(3) + + +def install_docker_on_rhel(phdl): + #Install Docker on RHEL/CentOS/Fedora + cmds = ['sudo dnf -y install dnf-plugins-core', + 'sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo', + 'sudo dnf -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin', + 'sudo systemctl start docker', + 'sudo systemctl enable docker'] + for cmd in cmds: + out_dict = phdl.exec(cmd) + for node in out_dict.keys(): + if re.search( 'error|fail', out_dict[node], re.I ): + fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs') + time.sleep(3) + +def install_docker_on_suse(phdl): + #Install Docker on SLES + cmds = ['sudo zypper refresh', 'sudo zypper -n install docker', + 'sudo systemctl start docker', 'sudo systemctl enable docker'] + for cmd in cmds: + out_dict = phdl.exec(cmd) + for node in out_dict.keys(): + if re.search( 'error|fail', out_dict[node], re.I ): + fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs') + time.sleep(3) + +def install_docker(phdl): + distro = detect_distro(phdl) + if distro == 'debian': + install_docker_on_ubuntu(phdl) + elif distro == 'rhel': + install_docker_on_rhel(phdl) + elif distro == 'suse': + install_docker_on_suse(phdl) def launch_docker_container( phdl, container_name, image, device_list=[], volume_dict={}, env_dict={}, network='host', diff --git a/lib/linux_utils.py b/lib/linux_utils.py index 4ab6f214..453dfb85 100644 --- a/lib/linux_utils.py +++ b/lib/linux_utils.py @@ -12,7 +12,8 @@ import rocm_plib from utils_lib import * - +import globals +log = globals.log @@ -857,3 +858,138 @@ def get_gpu_numa_dict( phdl ): print(gpu_numa_dict) return gpu_numa_dict + +#linux distro discovery and package management +def detect_distro(phdl): + """Detect Linux distro and return distro type""" + out_dict = phdl.exec('cat /etc/os-release') + for node, output in out_dict.items(): + if re.search('ubuntu|debian', output, re.I): + return 'debian' + elif re.search('rhel|centos|fedora|rocky|alma', output, re.I): + return 'rhel' + elif re.search('sles|suse', output, re.I): + return 'suse' + return 'debian' # Default fallback + +def get_package_manager_cmd(distro, action='install'): + #based on distro find correct cmds + commands = { + 'debian': { + 'update': 'sudo apt-get update -y', + 'install': 'sudo apt-get install -y', + 'remove': 'sudo apt-get remove -y', + }, + 'rhel': { + 'update': 'sudo dnf check-update || true', + 'install': 'sudo dnf install -y', + 'remove': 'sudo dnf remove -y', + }, + 'suse': { + 'update': 'sudo zypper refresh', + 'install': 'sudo zypper install -y', + 'remove': 'sudo zypper remove -y', + } + } + return commands.get(distro, commands['debian']).get(action, '') + +def translate_package_name(package, distro): + #some pkgs differ in naming across distro. use right ones + if distro == 'debian': + return package + + # Package name mappings for non-Debian distros + package_map = { + 'rhel': { + 'libgtest-dev': 'gtest-devel', + 'libpci-dev': 'pciutils-devel', + 'libpci3': 'pciutils', + 'libyaml-cpp-dev': 'yaml-cpp-devel', + 'libibverbs-dev': 'libibverbs-devel', + 'librdmacm-dev': 'librdmacm-devel', + 'libibumad-dev': 'libibumad-devel', + 'openmpi-bin': 'openmpi', + 'openmpi-common': 'openmpi', + 'libopenmpi-dev': 'openmpi-devel', + 'hipblaslt-dev': 'hipblaslt-devel', + 'ibverbs-providers': 'rdma-core', + 'build-essential': ['gcc', 'gcc-c++', 'make'], # Install separately as multiple packages + 'apt-transport-https': None, # Not needed + 'software-properties-common': None, # Not needed + }, + 'suse': { + 'libgtest-dev': 'gtest', + 'libpci-dev': 'pciutils-devel', + 'libpci3': 'pciutils', + 'libyaml-cpp-dev': 'libyaml-cpp-devel', + 'openmpi-bin': 'openmpi', + 'libopenmpi-dev': 'openmpi-devel', + 'build-essential': ['gcc', 'gcc-c++', 'make'] + } + } + + map_dict = package_map.get(distro, {}) + return map_dict.get(package, package) + +def map_packages(distro, packages): + #update package names and flatten if needed + result = [] + for pkg in packages: + translated = translate_package_name(pkg, distro) + + if translated is None: + # Skip packages not needed on this distro + continue + elif isinstance(translated, list): + # Package expands to multiple packages (e.g., build-essential) + result.extend(translated) + else: + # Single package + result.append(translated) + + return result + +def install_package(hdl, package, distro=None, timeout=200): + # Install a package using the appropriate package manager for the detected distro. + if distro is None: + distro = detect_distro(hdl) + + # Translate package name for the distro + translated_pkg = translate_package_name(package, distro) + + # Skip if package is not needed on this distro + if translated_pkg is None: + log.info(f'Package {package} not needed on {distro}, skipping') + return {} + + # Get the install command + install_cmd = get_package_manager_cmd(distro, 'install') + + # Execute and return out_dict for error checking + out_dict = hdl.exec(f'{install_cmd} {translated_pkg}', timeout=timeout) + return out_dict + +def update_package_cache(hdl, distro=None, timeout=600): + """ + Update package manager cache and return out_dict for error checking. + + Returns: + out_dict: Command output dictionary for error checking + """ + if distro is None: + distro = detect_distro(hdl) + + update_cmd = get_package_manager_cmd(distro, 'update') + out_dict = hdl.exec(update_cmd, timeout=timeout) + return out_dict + +def install_build_tools(hdl, distro, timeout=200): + """Install build tools appropriate for the distro""" + if distro == 'debian': + return install_package(hdl, 'build-essential', distro, timeout) + elif distro in ['rhel', 'suse']: + results = {} + for pkg in ['gcc', 'gcc-c++', 'make']: + out_dict = install_package(hdl, pkg, distro, timeout) + results.update(out_dict) + return results \ No newline at end of file diff --git a/tests/health/install/install_babelstream.py b/tests/health/install/install_babelstream.py index f897651e..323cf187 100644 --- a/tests/health/install/install_babelstream.py +++ b/tests/health/install/install_babelstream.py @@ -20,7 +20,13 @@ sys.path.insert( 0, './lib' ) from parallel_ssh_lib import * from utils_lib import * - +from linux_utils import ( + detect_distro, + install_package, + update_package_cache, + translate_package_name, + map_packages +) import globals log = globals.log @@ -210,7 +216,7 @@ def test_install_babelstream( phdl, shdl, config_dict ): -def test_install_open_mpi(phdl, config_dict, ): +def test_install_open_mpi(phdl, shdl, config_dict, ): """ Install Open MPI across all nodes and verify that mpiexec is available. @@ -238,8 +244,24 @@ def test_install_open_mpi(phdl, config_dict, ): else: hdl = phdl path = config_dict['path'] - out_dict = phdl.exec(f'sudo apt update -y', timeout=200) - out_dict = phdl.exec(f'sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev', timeout=200) + #install via right package manager + distro = detect_distro(phdl) + print(f'Detected distro type: {distro}') + + out_dict = update_package_cache(hdl, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed', out_dict[node], re.I): + log.warning(f'Package update warning on {node}') + + # Install packages with error checking after each one + packages = ['openmpi-bin', 'openmpi-common', 'libopenmpi-dev'] + package_list = map_packages(distro, packages) + for package in package_list: + out_dict = install_package(hdl, package, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed|unable to locate', out_dict[node], re.I): + fail_test(f'Failed to install {package} on {node}') + out_dict = phdl.exec('which mpiexec') for node in out_dict.keys(): if not re.search( 'mpiexec', out_dict[node] ): diff --git a/tests/health/install/install_rocblas.py b/tests/health/install/install_rocblas.py index f53ebf0a..e69c5fd1 100644 --- a/tests/health/install/install_rocblas.py +++ b/tests/health/install/install_rocblas.py @@ -10,12 +10,17 @@ import re import sys import os -import sys import time import json import logging -import time +from linux_utils import ( + detect_distro, + install_package, + update_package_cache, + translate_package_name, + map_packages +) sys.path.insert( 0, './lib' ) from parallel_ssh_lib import * @@ -198,12 +203,24 @@ def test_rocblas_install( phdl, shdl, config_dict, ): hdl.exec(f'sudo rm -rf {git_install_path}/rocBLAS') git_url = config_dict['git_url'] - out_dict = hdl.exec('sudo apt update -y', timeout=200) - out_dict = hdl.exec('sudo apt install -y libgtest-dev', timeout=200) - out_dict = hdl.exec('sudo apt install -y cmake', timeout=200) - out_dict = hdl.exec('sudo apt install -y gfortran', timeout=200) - out_dict = hdl.exec('sudo apt install -y hipblaslt-dev', timeout=200) + #install via right package manager + distro = detect_distro(phdl) + print(f'Detected distro type: {distro}') + out_dict = update_package_cache(hdl, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed', out_dict[node], re.I): + log.warning(f'Package update warning on {node}') + + # Install packages with error checking after each one + packages = ['libgtest-dev', 'cmake', 'gfortran', 'hipblaslt-dev'] + package_list = map_packages(distro, packages) + for package in package_list: + out_dict = install_package(hdl, package, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed|unable to locate', out_dict[node], re.I): + fail_test(f'Failed to install {package} on {node}') + time.sleep(2) #out_dict = phdl.exec('git init') out_dict = hdl.exec(f'cd {git_install_path};git clone {git_url}', timeout=100 ) diff --git a/tests/health/install/install_rvs.py b/tests/health/install/install_rvs.py index c8949e37..1e584740 100644 --- a/tests/health/install/install_rvs.py +++ b/tests/health/install/install_rvs.py @@ -19,7 +19,13 @@ from parallel_ssh_lib import * from utils_lib import * from verify_lib import * - +from linux_utils import ( + detect_distro, + install_package, + update_package_cache, + translate_package_name, + map_packages +) import globals log = globals.log @@ -203,13 +209,23 @@ def test_install_rvs(phdl, shdl, config_dict): # If RVS is not found or configs are missing, install it if not rvs_found or not config_found: log.info('RVS not found, attempting to install from artifactory repo first') - # First try to install from artifactory repo package_installed = False - out_dict = hdl.exec('sudo apt-get update -y', timeout=600) - out_dict = hdl.exec('sudo apt-get install -y libpci3 libpci-dev doxygen unzip cmake git libyaml-cpp-dev', timeout=600) - out_dict = hdl.exec('sudo apt-get install -y rocblas rocm-smi-lib', timeout=600) - out_dict = hdl.exec('sudo apt-get install -y rocm-validation-suite', timeout=600) + packages = ['libpci3', 'libpci-dev', 'doxygen', 'unzip', 'cmake', + 'git', 'libyaml-cpp-dev', 'rocblas', 'rocm-smi-lib'] + distro = detect_distro(hdl) + package_list = map_packages(distro, packages) + out_dict = update_package_cache(hdl, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed', out_dict[node], re.I): + log.warning(f'Package update warning on {node}') + + for package in package_list: + out_dict = install_package(hdl, package, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed|unable to locate', out_dict[node], re.I): + fail_test(f'Failed to install {package} on {node}') + for node in out_dict.keys(): if re.search('Unable to locate package|Package.*not found|E: Could not get lock|dpkg: error', out_dict[node], re.I): diff --git a/tests/health/rocblas_cvs.py b/tests/health/rocblas_cvs.py index 0bc9950a..be547ad0 100644 --- a/tests/health/rocblas_cvs.py +++ b/tests/health/rocblas_cvs.py @@ -24,7 +24,12 @@ sys.path.insert( 0, './lib' ) from parallel_ssh_lib import * from utils_lib import * - +from linux_utils import ( + detect_distro, + install_package, + update_package_cache, + map_packages +) import globals log = globals.log @@ -127,11 +132,16 @@ def test_rocblas_install( hdl, phdl, config_dict, ): phdl.exec('sudo rm -rf /home/venksrin/rocBLAS') time.sleep(5) git_url = config_dict['git_url'] - out_dict = phdl.exec('sudo apt update -y', timeout=200) - out_dict = phdl.exec('sudo apt install -y libgtest-dev', timeout=200) - out_dict = phdl.exec('sudo apt install -y cmake', timeout=200) - out_dict = phdl.exec('sudo apt install -y gfortran', timeout=200) - time.sleep(3) + packages = ['libgtest-dev', 'cmake', 'gfortran'] + distro = detect_distro( phdl ) + log.info(f'Detected Distro : {distro}') + out_dict = update_package_cache( phdl, distro, timeout=300 ) + log.info(f'Updated package cache : {out_dict}') + package_list = map_packages( distro,packages ) + for pkg in package_list: + out_dict = install_package( phdl, pkg, distro, timeout=300 ) + log.info(f'Installed package {pkg} : {out_dict}') + log.info(out_dict) log.info(f'Inputs - {package_path}, {path}, {git_url}') print('%%%%%%%%%%%%%%%%%') diff --git a/tests/ibperf/install_ibperf_tools.py b/tests/ibperf/install_ibperf_tools.py index b3f5a584..0b97d847 100644 --- a/tests/ibperf/install_ibperf_tools.py +++ b/tests/ibperf/install_ibperf_tools.py @@ -23,7 +23,12 @@ from parallel_ssh_lib import * from utils_lib import * from verify_lib import * - +from linux_utils import ( + detect_distro, + install_package, + update_package_cache, + map_packages +) import globals log = globals.log @@ -211,12 +216,23 @@ def test_install_ib_perf(phdl, shdl, config_dict ): if re.search( 'true', config_dict['install_perf_package'], re.I ): shdl.exec( f'mkdir -p {config_dict["install_dir"]}') - phdl.exec( 'sudo apt update -y', timeout=200 ) - phdl.exec( 'sudo apt install -y git build-essential autoconf automake libtool pkg-config', timeout=200 ) - phdl.exec( 'sudo apt install -y libibverbs-dev librdmacm-dev ibverbs-providers rdma-core', timeout=200 ) - phdl.exec( 'sudo apt install -y libibumad-dev' ) - phdl.exec( 'sudo apt install -y libpci-dev' ) - phdl.exec( 'sudo apt install -y numactl' ) + distro = detect_distro(phdl) + out_dict = update_package_cache(phdl, distro, timeout=600) + # Check for errors if needed + for node in out_dict.keys(): + if re.search('error|failed', out_dict[node], re.I): + log.warning(f'Package update warning on {node}') + + packages = ['git', 'build-essential', 'autoconf', 'automake', 'libtool', + 'pkg-config', 'libibverbs-dev', 'librdmacm-dev', + 'ibverbs-providers', 'rdma-core', 'libibumad-dev', + 'libpci-dev', 'numactl'] + package_list = map_packages(distro, packages) + for package in package_list: + out_dict = install_package(phdl, package, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed|unable to locate', out_dict[node], re.I): + fail_test(f'Failed to install {package} on {node}') shdl.exec( f'cd {config_dict["install_dir"]}; git clone https://github.com/linux-rdma/perftest' ) shdl.exec( f'cd {config_dict["install_dir"]}/perftest; ./autogen.sh', timeout=100 ) shdl.exec( f'cd {config_dict["install_dir"]}/perftest; ./configure --prefix={config_dict['install_dir']}/perftest --with-rocm={config_dict["rocm_dir"]} --enable-rocm', timeout=200 ) From 0e3e0eb6a98788ae90e625e77a5e6bee7540942c Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Tue, 18 Nov 2025 03:02:43 -0800 Subject: [PATCH 2/2] update info for rhel repo addition --- lib/docker_lib.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/docker_lib.py b/lib/docker_lib.py index f4e99a9b..8834d49d 100644 --- a/lib/docker_lib.py +++ b/lib/docker_lib.py @@ -112,6 +112,10 @@ def install_docker_on_ubuntu( phdl ): def install_docker_on_rhel(phdl): #Install Docker on RHEL/CentOS/Fedora + ''' + docker ce comes distributed from Docker Inc repo for centos for rhel/alma and centos. + hence we need to add the repo and then install via dnf, as there will be failures via default + ''' cmds = ['sudo dnf -y install dnf-plugins-core', 'sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo', 'sudo dnf -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin',