78 changes: 62 additions & 16 deletions lib/docker_lib.py
@@ -17,7 +17,7 @@

from utils_lib import *
from verify_lib import *

from linux_utils import detect_distro


def get_running_docker_containers(phdl):
@@ -86,21 +86,67 @@ def old_install_docker_on_ubuntu( phdl ):


def install_docker_on_ubuntu( phdl ):
phdl.exec('sudo rm /etc/apt/keyrings/docker.gpg')
phdl.exec('sudo rm /etc/apt/sources.list.d/docker.list')
phdl.exec('sudo apt-get -y update')
phdl.exec('sudo apt install -y apt-transport-https ca-certificates curl software-properties-common')
phdl.exec('curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -')
phdl.exec('sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"')
phdl.exec('apt-cache policy docker-ce')
phdl.exec('sudo apt install -y docker-ce')
time.sleep(3)
phdl.exec('sudo systemctl start docker')
time.sleep(3)
phdl.exec('sudo systemctl status docker')



cmds = ['sudo rm /etc/apt/keyrings/docker.gpg',
'sudo rm /etc/apt/sources.list.d/docker.list',
'sudo apt-get -y update',
'sudo apt install -y apt-transport-https ca-certificates curl software-properties-common',
'curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -',
'sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"',
'apt-cache policy docker-ce',
'sudo apt install -y docker-ce']
for cmd in cmds:
out_dict = phdl.exec(cmd)
for node in out_dict.keys():
if re.search( 'error|fail', out_dict[node], re.I ):
fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs')

sysctl_cmds = ['sudo systemctl start docker',
'sudo systemctl enable docker']
for cmd in sysctl_cmds:
out_dict = phdl.exec(cmd)
for node in out_dict.keys():
if re.search( 'error|fail', out_dict[node], re.I ):
fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs')
time.sleep(3)


def install_docker_on_rhel(phdl):
#Install Docker on RHEL/CentOS/Fedora
'''
Docker CE for RHEL/Alma/CentOS is distributed from Docker Inc.'s CentOS repo,
so we add that repo and install via dnf; installing from the distro's default repos fails.
'''
cmds = ['sudo dnf -y install dnf-plugins-core',
'sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo',
'sudo dnf -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin',
'sudo systemctl start docker',
'sudo systemctl enable docker']
for cmd in cmds:
out_dict = phdl.exec(cmd)
for node in out_dict.keys():
if re.search( 'error|fail', out_dict[node], re.I ):
fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs')
time.sleep(3)

def install_docker_on_suse(phdl):
#Install Docker on SLES
cmds = ['sudo zypper refresh', 'sudo zypper -n install docker',
'sudo systemctl start docker', 'sudo systemctl enable docker']
for cmd in cmds:
out_dict = phdl.exec(cmd)
for node in out_dict.keys():
if re.search( 'error|fail', out_dict[node], re.I ):
fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs')
time.sleep(3)

def install_docker(phdl):
distro = detect_distro(phdl)
if distro == 'debian':
install_docker_on_ubuntu(phdl)
elif distro == 'rhel':
install_docker_on_rhel(phdl)
elif distro == 'suse':
install_docker_on_suse(phdl)

def launch_docker_container( phdl, container_name, image, device_list=[], volume_dict={},
env_dict={}, network='host',
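With the install_docker dispatcher above, callers no longer need distro-specific branches. A minimal sketch of the intended call pattern (ensure_docker is a hypothetical wrapper, not part of this library):

def ensure_docker(phdl):
    # Hypothetical helper: install Docker for whatever distro the nodes run,
    # then confirm the daemon responds on every node.
    install_docker(phdl)
    out_dict = phdl.exec('sudo docker info')
    for node, output in out_dict.items():
        if re.search('error|fail|cannot connect', output, re.I):
            fail_test(f'Docker daemon not healthy on node {node}, please check logs')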
138 changes: 137 additions & 1 deletion lib/linux_utils.py
@@ -12,7 +12,8 @@
import rocm_plib

from utils_lib import *

import globals
log = globals.log



@@ -857,3 +858,138 @@ def get_gpu_numa_dict( phdl ):

print(gpu_numa_dict)
return gpu_numa_dict

#linux distro discovery and package management
def detect_distro(phdl):
"""Detect Linux distro and return distro type"""
out_dict = phdl.exec('cat /etc/os-release')
for node, output in out_dict.items():
if re.search('ubuntu|debian', output, re.I):
return 'debian'
elif re.search('rhel|centos|fedora|rocky|alma', output, re.I):
return 'rhel'
elif re.search('sles|suse', output, re.I):
return 'suse'
return 'debian' # Default fallback

def get_package_manager_cmd(distro, action='install'):
# Return the correct package-manager command for the given distro and action
commands = {
'debian': {
'update': 'sudo apt-get update -y',
'install': 'sudo apt-get install -y',
'remove': 'sudo apt-get remove -y',
},
'rhel': {
'update': 'sudo dnf check-update || true',
'install': 'sudo dnf install -y',
'remove': 'sudo dnf remove -y',
},
'suse': {
'update': 'sudo zypper refresh',
'install': 'sudo zypper install -y',
'remove': 'sudo zypper remove -y',
}
}
return commands.get(distro, commands['debian']).get(action, '')
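# Minimal usage sketch (illustrative values, assuming the first node is Ubuntu):
#   detect_distro(phdl)                           # -> 'debian' (matches 'ubuntu' in /etc/os-release)
#   get_package_manager_cmd('debian', 'install')  # -> 'sudo apt-get install -y'
#   get_package_manager_cmd('rhel', 'update')     # -> 'sudo dnf check-update || true'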

def translate_package_name(package, distro):
# Some packages are named differently across distros; map to the right name for this one
if distro == 'debian':
return package

# Package name mappings for non-Debian distros
package_map = {
'rhel': {
'libgtest-dev': 'gtest-devel',
'libpci-dev': 'pciutils-devel',
'libpci3': 'pciutils',
'libyaml-cpp-dev': 'yaml-cpp-devel',
'libibverbs-dev': 'libibverbs-devel',
'librdmacm-dev': 'librdmacm-devel',
'libibumad-dev': 'libibumad-devel',
'openmpi-bin': 'openmpi',
'openmpi-common': 'openmpi',
'libopenmpi-dev': 'openmpi-devel',
'hipblaslt-dev': 'hipblaslt-devel',
'ibverbs-providers': 'rdma-core',
'build-essential': ['gcc', 'gcc-c++', 'make'], # Install separately as multiple packages
'apt-transport-https': None, # Not needed
'software-properties-common': None, # Not needed
},
'suse': {
'libgtest-dev': 'gtest',
'libpci-dev': 'pciutils-devel',
'libpci3': 'pciutils',
'libyaml-cpp-dev': 'libyaml-cpp-devel',
'openmpi-bin': 'openmpi',
'libopenmpi-dev': 'openmpi-devel',
'build-essential': ['gcc', 'gcc-c++', 'make']
}
}

map_dict = package_map.get(distro, {})
return map_dict.get(package, package)

def map_packages(distro, packages):
# Translate package names for the distro and flatten multi-package expansions
result = []
for pkg in packages:
translated = translate_package_name(pkg, distro)

if translated is None:
# Skip packages not needed on this distro
continue
elif isinstance(translated, list):
# Package expands to multiple packages (e.g., build-essential)
result.extend(translated)
else:
# Single package
result.append(translated)

return result
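# Worked example (hypothetical input, RHEL target):
#   map_packages('rhel', ['build-essential', 'libgtest-dev', 'apt-transport-https'])
#   -> ['gcc', 'gcc-c++', 'make', 'gtest-devel']
# build-essential expands to three packages, libgtest-dev is renamed to
# gtest-devel, and apt-transport-https maps to None so it is skipped.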

def install_package(hdl, package, distro=None, timeout=200):
# Install a package using the appropriate package manager for the detected distro.
if distro is None:
distro = detect_distro(hdl)

# Translate package name for the distro
translated_pkg = translate_package_name(package, distro)

# Skip if package is not needed on this distro
if translated_pkg is None:
log.info(f'Package {package} not needed on {distro}, skipping')
return {}

# Get the install command
install_cmd = get_package_manager_cmd(distro, 'install')

# Execute and return out_dict for error checking
out_dict = hdl.exec(f'{install_cmd} {translated_pkg}', timeout=timeout)
return out_dict

def update_package_cache(hdl, distro=None, timeout=600):
"""
Update package manager cache and return out_dict for error checking.

Returns:
out_dict: Command output dictionary for error checking
"""
if distro is None:
distro = detect_distro(hdl)

update_cmd = get_package_manager_cmd(distro, 'update')
out_dict = hdl.exec(update_cmd, timeout=timeout)
return out_dict

def install_build_tools(hdl, distro, timeout=200):
"""Install build tools appropriate for the distro"""
if distro == 'debian':
return install_package(hdl, 'build-essential', distro, timeout)
elif distro in ['rhel', 'suse']:
results = {}
for pkg in ['gcc', 'gcc-c++', 'make']:
out_dict = install_package(hdl, pkg, distro, timeout)
results.update(out_dict)
return results
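The test changes below all share one caller-side pattern built on these helpers: refresh the package cache, map the Debian-style names, install packages one at a time, and fail on any node whose output mentions an error. A minimal sketch of that pattern, assuming the same hdl handle and fail_test helper used throughout this repo (install_packages_checked itself is hypothetical):

def install_packages_checked(hdl, packages, timeout=200):
    # Hypothetical convenience wrapper mirroring the pattern used in the tests.
    distro = detect_distro(hdl)
    update_package_cache(hdl, distro)
    for package in map_packages(distro, packages):
        out_dict = install_package(hdl, package, distro, timeout=timeout)
        for node, output in out_dict.items():
            if re.search('error|failed|unable to locate', output, re.I):
                fail_test(f'Failed to install {package} on {node}')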
30 changes: 26 additions & 4 deletions tests/health/install/install_babelstream.py
@@ -20,7 +20,13 @@
sys.path.insert( 0, './lib' )
from parallel_ssh_lib import *
from utils_lib import *

from linux_utils import (
detect_distro,
install_package,
update_package_cache,
translate_package_name,
map_packages
)
import globals

log = globals.log
@@ -210,7 +216,7 @@ def test_install_babelstream( phdl, shdl, config_dict ):



def test_install_open_mpi(phdl, config_dict, ):
def test_install_open_mpi(phdl, shdl, config_dict, ):
"""
Install Open MPI across all nodes and verify that mpiexec is available.

@@ -238,8 +244,24 @@ def test_install_open_mpi(phdl, config_dict, ):
else:
hdl = phdl
path = config_dict['path']
out_dict = phdl.exec(f'sudo apt update -y', timeout=200)
out_dict = phdl.exec(f'sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev', timeout=200)
# Install via the distro-appropriate package manager
distro = detect_distro(phdl)
print(f'Detected distro type: {distro}')

out_dict = update_package_cache(hdl, distro, timeout=200)
for node in out_dict.keys():
if re.search('error|failed', out_dict[node], re.I):
log.warning(f'Package update warning on {node}')

# Install packages with error checking after each one
packages = ['openmpi-bin', 'openmpi-common', 'libopenmpi-dev']
package_list = map_packages(distro, packages)
for package in package_list:
out_dict = install_package(hdl, package, distro, timeout=200)
for node in out_dict.keys():
if re.search('error|failed|unable to locate', out_dict[node], re.I):
fail_test(f'Failed to install {package} on {node}')

out_dict = phdl.exec('which mpiexec')
for node in out_dict.keys():
if not re.search( 'mpiexec', out_dict[node] ):
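Beyond checking that mpiexec is on the PATH, a lightweight functional check could run a single-rank job; a sketch under the assumption that a one-rank hostname run is acceptable on these nodes:

out_dict = phdl.exec('mpiexec --allow-run-as-root -np 1 hostname')
for node in out_dict.keys():
    if re.search('error|not found', out_dict[node], re.I):
        fail_test(f'mpiexec failed to run a single-rank job on node {node}')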
31 changes: 24 additions & 7 deletions tests/health/install/install_rocblas.py
@@ -10,12 +10,17 @@
import re
import sys
import os
import sys
import time
import json
import logging
import time

from linux_utils import (
detect_distro,
install_package,
update_package_cache,
translate_package_name,
map_packages
)

sys.path.insert( 0, './lib' )
from parallel_ssh_lib import *
@@ -198,12 +203,24 @@ def test_rocblas_install( phdl, shdl, config_dict, ):
hdl.exec(f'sudo rm -rf {git_install_path}/rocBLAS')

git_url = config_dict['git_url']
out_dict = hdl.exec('sudo apt update -y', timeout=200)
out_dict = hdl.exec('sudo apt install -y libgtest-dev', timeout=200)
out_dict = hdl.exec('sudo apt install -y cmake', timeout=200)
out_dict = hdl.exec('sudo apt install -y gfortran', timeout=200)
out_dict = hdl.exec('sudo apt install -y hipblaslt-dev', timeout=200)
# Install via the distro-appropriate package manager
distro = detect_distro(phdl)
print(f'Detected distro type: {distro}')

out_dict = update_package_cache(hdl, distro, timeout=200)
for node in out_dict.keys():
if re.search('error|failed', out_dict[node], re.I):
log.warning(f'Package update warning on {node}')

# Install packages with error checking after each one
packages = ['libgtest-dev', 'cmake', 'gfortran', 'hipblaslt-dev']
package_list = map_packages(distro, packages)
for package in package_list:
out_dict = install_package(hdl, package, distro, timeout=200)
for node in out_dict.keys():
if re.search('error|failed|unable to locate', out_dict[node], re.I):
fail_test(f'Failed to install {package} on {node}')

time.sleep(2)
#out_dict = phdl.exec('git init')
out_dict = hdl.exec(f'cd {git_install_path};git clone {git_url}', timeout=100 )
28 changes: 22 additions & 6 deletions tests/health/install/install_rvs.py
@@ -19,7 +19,13 @@
from parallel_ssh_lib import *
from utils_lib import *
from verify_lib import *

from linux_utils import (
detect_distro,
install_package,
update_package_cache,
translate_package_name,
map_packages
)
import globals

log = globals.log
@@ -203,13 +209,23 @@ def test_install_rvs(phdl, shdl, config_dict):
# If RVS is not found or configs are missing, install it
if not rvs_found or not config_found:
log.info('RVS not found, attempting to install from artifactory repo first')

# First try to install from artifactory repo
package_installed = False
out_dict = hdl.exec('sudo apt-get update -y', timeout=600)
out_dict = hdl.exec('sudo apt-get install -y libpci3 libpci-dev doxygen unzip cmake git libyaml-cpp-dev', timeout=600)
out_dict = hdl.exec('sudo apt-get install -y rocblas rocm-smi-lib', timeout=600)
out_dict = hdl.exec('sudo apt-get install -y rocm-validation-suite', timeout=600)
packages = ['libpci3', 'libpci-dev', 'doxygen', 'unzip', 'cmake',
'git', 'libyaml-cpp-dev', 'rocblas', 'rocm-smi-lib']
distro = detect_distro(hdl)
package_list = map_packages(distro, packages)
out_dict = update_package_cache(hdl, distro, timeout=200)
for node in out_dict.keys():
if re.search('error|failed', out_dict[node], re.I):
log.warning(f'Package update warning on {node}')

for package in package_list:
out_dict = install_package(hdl, package, distro, timeout=200)
for node in out_dict.keys():
if re.search('error|failed|unable to locate', out_dict[node], re.I):
fail_test(f'Failed to install {package} on {node}')


for node in out_dict.keys():
if re.search('Unable to locate package|Package.*not found|E: Could not get lock|dpkg: error', out_dict[node], re.I):