Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

functionality for rebuilding software: try it on OpenMPI 4.1.x to fix smcuda issue #311

Merged
merged 8 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,29 +204,34 @@ ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12

# use PR patch file to determine in which easystack files stuff was added
changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing')
if [ -z ${changed_easystacks} ]; then
if [ -z "${changed_easystacks}" ]; then
echo "No missing installations, party time!" # Ensure the bot report success, as there was nothing to be build here
else
for easystack_file in ${changed_easystacks}; do


# first process rebuilds, if any, then easystack files for new installations
# "|| true" is used to make sure that the grep command always returns success
rebuild_easystacks=$(echo "${changed_easystacks}" | (grep "/rebuilds/" || true))
new_easystacks=$(echo "${changed_easystacks}" | (grep -v "/rebuilds/" || true))
for easystack_file in ${rebuild_easystacks} ${new_easystacks}; do

echo -e "Processing easystack file ${easystack_file}...\n\n"

# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g')

# load EasyBuild module (will be installed if it's not available yet)
source ${TOPDIR}/load_easybuild_module.sh ${eb_version}

${EB} --show-config

echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..."

if [ -f ${easystack_file} ]; then
echo_green "Feeding easystack file ${easystack_file} to EasyBuild..."

${EB} --easystack ${TOPDIR}/${easystack_file} --robot
ec=$?

# copy EasyBuild log file if EasyBuild exited with an error
if [ ${ec} -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
Expand All @@ -236,12 +241,12 @@ else
# copy to build logs dir (with context added)
copy_build_log "${eb_last_log}" "${build_logs_dir}"
fi

$TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file} ${TOPDIR}/${pr_diff}
else
fatal_error "Easystack file ${easystack_file} not found!"
fi

done
fi

Expand Down
125 changes: 125 additions & 0 deletions EESSI-remove-software.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/bin/bash
#
# Script to remove part of the EESSI software stack (version set through init/eessi_defaults)

# see example parsing of command line arguments at
# https://wiki.bash-hackers.org/scripting/posparams#using_a_while_loop
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash

display_help() {
echo "usage: $0 [OPTIONS]"
echo " -g | --generic - instructs script to build for generic architecture target"
echo " -h | --help - display this usage information"
}

POSITIONAL_ARGS=()

while [[ $# -gt 0 ]]; do
case $1 in
-g|--generic)
DETECTION_PARAMETERS="--generic"
shift
;;
-h|--help)
display_help # Call your function
# no shifting needed here, we're done.
exit 0
;;
-*|--*)
echo "Error: Unknown option: $1" >&2
exit 1
;;
*) # No more options
POSITIONAL_ARGS+=("$1") # save positional arg
shift
;;
esac
done

set -- "${POSITIONAL_ARGS[@]}"

TOPDIR=$(dirname $(realpath $0))

export TMPDIR=$(mktemp -d /tmp/eessi-remove.XXXXXXXX)

source $TOPDIR/scripts/utils.sh

echo ">> Determining software subdirectory to use for current build host..."
if [ -z $EESSI_SOFTWARE_SUBDIR_OVERRIDE ]; then
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS)
echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script"
else
echo ">> Picking up pre-defined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE: ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
fi

echo ">> Setting up environment..."

source $TOPDIR/init/bash

if [ -d $EESSI_CVMFS_REPO ]; then
echo_green "$EESSI_CVMFS_REPO available, OK!"
else
fatal_error "$EESSI_CVMFS_REPO is not available!"
fi

if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then
fatal_error "Failed to determine software subdirectory?!"
elif [[ "${EESSI_SOFTWARE_SUBDIR}" != "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then
fatal_error "Values for EESSI_SOFTWARE_SUBDIR_OVERRIDE (${EESSI_SOFTWARE_SUBDIR_OVERRIDE}) and EESSI_SOFTWARE_SUBDIR (${EESSI_SOFTWARE_SUBDIR}) differ!"
else
echo_green ">> Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory!"
fi

echo ">> Configuring EasyBuild..."
EB="eb"
source $TOPDIR/configure_easybuild

echo ">> Setting up \$MODULEPATH..."
# make sure no modules are loaded
module --force purge
# ignore current $MODULEPATH entirely
module unuse $MODULEPATH
module use $EASYBUILD_INSTALLPATH/modules/all
if [[ -z ${MODULEPATH} ]]; then
fatal_error "Failed to set up \$MODULEPATH?!"
else
echo_green ">> MODULEPATH set up: ${MODULEPATH}"
fi

# assume there's only one diff file that corresponds to the PR patch file
pr_diff=$(ls [0-9]*.diff | head -1)

# if this script is run as root, use PR patch file to determine if software needs to be removed first
if [ $EUID -eq 0 ]; then
changed_easystacks_rebuilds=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing' | grep "/rebuilds/")
if [ -z ${changed_easystacks_rebuilds} ]; then
echo "No software needs to be removed."
else
for easystack_file in ${changed_easystacks_rebuilds}; do
# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g')

# load EasyBuild module (will be installed if it's not available yet)
source ${TOPDIR}/load_easybuild_module.sh ${eb_version}

if [ -f ${easystack_file} ]; then
echo_green "Software rebuild(s) requested in ${easystack_file}, so determining which existing installation have to be removed..."
# we need to remove existing installation directories first,
# so let's figure out which modules have to be rebuilt by doing a dry-run and grepping "someapp/someversion" for the relevant lines (with [R])
# * [R] $CFGS/s/someapp/someapp-someversion.eb (module: someapp/someversion)
rebuild_apps=$(eb --allow-use-as-root-and-accept-consequences --dry-run-short --rebuild --easystack ${easystack_file} | grep "^ \* \[R\]" | grep -o "module: .*[^)]" | awk '{print $2}')
for app in ${rebuild_apps}; do
app_dir=${EASYBUILD_INSTALLPATH}/software/${app}
app_module=${EASYBUILD_INSTALLPATH}/modules/all/${app}.lua
echo_yellow "Removing ${app_dir} and ${app_module}..."
rm -rf ${app_dir}
rm -rf ${app_module}
done
else
fatal_error "Easystack file ${easystack_file} not found!"
fi
done
fi
else
fatal_error "This script can only be run by root!"
fi
66 changes: 55 additions & 11 deletions bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,58 @@ COMMON_ARGS+=("--mode" "run")
# make sure to use the same parent dir for storing tarballs of tmp
PREVIOUS_TMP_DIR=${PWD}/previous_tmp

# prepare arguments to install_software_layer.sh (specific to build step)
declare -a BUILD_STEP_ARGS=()
declare -a INSTALL_SCRIPT_ARGS=()
declare -a REMOVAL_SCRIPT_ARGS=()
if [[ ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} =~ .*/generic$ ]]; then
INSTALL_SCRIPT_ARGS+=("--generic")
REMOVAL_SCRIPT_ARGS+=("--generic")
fi
[[ ! -z ${BUILD_LOGS_DIR} ]] && INSTALL_SCRIPT_ARGS+=("--build-logs-dir" "${BUILD_LOGS_DIR}")
[[ ! -z ${SHARED_FS_PATH} ]] && INSTALL_SCRIPT_ARGS+=("--shared-fs-path" "${SHARED_FS_PATH}")

# determine if the removal step has to be run
# assume there's only one diff file that corresponds to the PR patch file
pr_diff=$(ls [0-9]*.diff | head -1)
# the true at the end of the next command is important: grep will expectedly return 1 if there is no easystack file being added under rebuilds,
# but due to "set -e" the entire script would otherwise fail
changed_easystacks_rebuilds=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | (grep "/rebuilds/" || true))
if [[ -z "${changed_easystacks_rebuilds}" ]]; then
echo "This PR does not add any easystack files in a rebuilds subdirectory, so let's skip the removal step."
else
# prepare directory to store tarball of tmp for removal and build steps
TARBALL_TMP_REMOVAL_STEP_DIR=${PREVIOUS_TMP_DIR}/removal_step
mkdir -p ${TARBALL_TMP_REMOVAL_STEP_DIR}

# prepare arguments to eessi_container.sh specific to remove step
declare -a REMOVAL_STEP_ARGS=()
REMOVAL_STEP_ARGS+=("--save" "${TARBALL_TMP_REMOVAL_STEP_DIR}")
REMOVAL_STEP_ARGS+=("--storage" "${STORAGE}")
# add fakeroot option in order to be able to remove software, see:
# https://github.com/EESSI/software-layer/issues/312
REMOVAL_STEP_ARGS+=("--fakeroot")

# create tmp file for output of removal step
removal_outerr=$(mktemp remove.outerr.XXXX)

echo "Executing command to remove software:"
echo "./eessi_container.sh ${COMMON_ARGS[@]} ${REMOVAL_STEP_ARGS[@]}"
echo " -- ./EESSI-remove-software.sh \"${REMOVAL_SCRIPT_ARGS[@]}\" \"$@\" 2>&1 | tee -a ${removal_outerr}"
./eessi_container.sh "${COMMON_ARGS[@]}" "${REMOVAL_STEP_ARGS[@]}" \
-- ./EESSI-remove-software.sh "${REMOVAL_SCRIPT_ARGS[@]}" "$@" 2>&1 | tee -a ${removal_outerr}

# make sure that the build step resumes from the same temporary directory
# this is important, as otherwise the removed software will still be there
REMOVAL_TMPDIR=$(grep ' as tmp directory ' ${removal_outerr} | cut -d ' ' -f 2)
BUILD_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}")
fi

# prepare directory to store tarball of tmp for build step
TARBALL_TMP_BUILD_STEP_DIR=${PREVIOUS_TMP_DIR}/build_step
mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}

# prepare arguments to eessi_container.sh specific to build step
declare -a BUILD_STEP_ARGS=()
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
# add options required to handle NVIDIA support
Expand All @@ -182,14 +228,6 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
fi

# prepare arguments to install_software_layer.sh (specific to build step)
declare -a INSTALL_SCRIPT_ARGS=()
if [[ ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} =~ .*/generic$ ]]; then
INSTALL_SCRIPT_ARGS+=("--generic")
fi
[[ ! -z ${BUILD_LOGS_DIR} ]] && INSTALL_SCRIPT_ARGS+=("--build-logs-dir" "${BUILD_LOGS_DIR}")
[[ ! -z ${SHARED_FS_PATH} ]] && INSTALL_SCRIPT_ARGS+=("--shared-fs-path" "${SHARED_FS_PATH}")

# create tmp file for output of build step
build_outerr=$(mktemp build.outerr.XXXX)

Expand All @@ -211,8 +249,14 @@ declare -a TARBALL_STEP_ARGS=()
TARBALL_STEP_ARGS+=("--save" "${TARBALL_TMP_TARBALL_STEP_DIR}")

# determine temporary directory to resume from
BUILD_TMPDIR=$(grep ' as tmp directory ' ${build_outerr} | cut -d ' ' -f 2)
TARBALL_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
if [[ -z ${REMOVAL_TMPDIR} ]]; then
# no rebuild step was done, so the tarball step should resume from the build directory
BUILD_TMPDIR=$(grep ' as tmp directory ' ${build_outerr} | cut -d ' ' -f 2)
TARBALL_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
else
# a removal step was done, so resume from its temporary directory (which was also used for the build step)
TARBALL_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}")
fi

timestamp=$(date +%s)
# to set EESSI_VERSION we need to source init/eessi_defaults now
Expand Down
21 changes: 0 additions & 21 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,31 +84,10 @@
end
end

local function eessi_openmpi_load_hook(t)
-- disable smcuda BTL when loading OpenMPI module for aarch64/neoverse_v1,
-- to work around hang/crash due to bug in OpenMPI;
-- see https://gitlab.com/eessi/support/-/issues/41
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local moduleName = string.match(t.modFullName, "(.-)/")
local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
local ompiMcaBtl = os.getenv("OMPI_MCA_btl")
if ompiMcaBtl == nil then
setenv("OMPI_MCA_btl", "^smcuda")
else
setenv("OMPI_MCA_btl", ompiMcaBtl .. ",^smcuda")
end
end
end

-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
eessi_cuda_enabled_load_hook(t)
eessi_openmpi_load_hook(t)
end


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# 2024-04-09
# Rebuild all OpenMPI 4.1.x versions due to an issue with smcuda:
# https://github.com/open-mpi/ompi/issues/12270
# https://github.com/open-mpi/ompi/pull/12344
# https://github.com/easybuilders/easybuild-easyconfigs/pull/19940
easyconfigs:
- OpenMPI-4.1.4-GCC-12.2.0.eb:
options:
from-pr: 19940
- OpenMPI-4.1.5-GCC-12.3.0:
options:
from-pr: 19940
- OpenMPI-4.1.6-GCC-13.2.0:
options:
from-pr: 19940
11 changes: 11 additions & 0 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ display_help() {
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
Expand Down Expand Up @@ -113,6 +114,7 @@ display_help() {
ACCESS="ro"
CONTAINER="docker://ghcr.io/eessi/build-node:debian11"
#DRY_RUN=0
FAKEROOT=0
VERBOSE=0
STORAGE=
LIST_REPOS=0
Expand Down Expand Up @@ -140,6 +142,10 @@ while [[ $# -gt 0 ]]; do
# DRY_RUN=1
# shift 1
# ;;
-f|--fakeroot)
FAKEROOT=1
shift 1
;;
-g|--storage)
STORAGE="$2"
shift 2
Expand Down Expand Up @@ -466,6 +472,11 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
fi
fi

# Configure the fakeroot setting for the container
if [[ ${FAKEROOT} -eq 1 ]]; then
ADDITIONAL_CONTAINER_OPTIONS+=("--fakeroot")
fi

# set up repository config (always create directory repos_cfg and populate it with info when
# arg -r|--repository is used)
mkdir -p ${EESSI_TMPDIR}/repos_cfg
Expand Down
Loading