From b2e9cdf57ccda2d32f0569cc251941a985c0b084 Mon Sep 17 00:00:00 2001
From: Thomas Roeblitz <thomas.roblitz@uib.no>
Date: Wed, 17 Jan 2024 22:37:43 +0100
Subject: [PATCH] add bot/inspect.sh script

---
 bot/inspect.sh | 446 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 446 insertions(+)
 create mode 100755 bot/inspect.sh

diff --git a/bot/inspect.sh b/bot/inspect.sh
new file mode 100755
index 0000000000..9d1fa87e1f
--- /dev/null
+++ b/bot/inspect.sh
@@ -0,0 +1,446 @@
+#!/usr/bin/env bash
+#
+# Script to inspect result of a build job for the EESSI software layer.
+# Intended use is that it is called with a path to a job directory.
+#
+# This script is part of the EESSI software layer, see
+# https://github.com/EESSI/software-layer.git
+#
+# author: Thomas Roeblitz (@trz42)
+#
+# license: GPLv2
+#
+
+# ASSUMPTIONs:
+#  - Script is executed on the same architecture the job was running on.
+#    - Initially, we also assume that is run on the same resource with the
+#      same (compute) node setup (local disk space, HTTP proxies, etc.)
+#  - The job directory being supplied has been prepared by the bot with a
+#    checkout of a pull request (OR by some other means)
+#  - The job directory contains a directory 'cfg' where the main config
+#    file 'job.cfg' has been deposited.
+#    - The 'cfg' directory may contain any additional files referenced in
+#      'job.cfg' (repos.cfg, etc.).
+#  - The job produced some tarballs for its state (tmp disk for overlayfs,
+#    CVMFS cache, etc.) under 'previous_tmp/{build,tarball}_step'.
+
+# stop as soon as something fails
+set -e
+
+display_help() {
+  echo "usage: $0 [OPTIONS]"
+  echo "  -h | --help            -  display this usage information"
+  echo "  -r | --resume TGZ      -  inspect job saved in tarball path TGZ; note, we assume the path"
+  echo "                            to be something like JOB_DIR/previous_tmp/{build,tarball}_step/TARBALL.tgz"
+  echo "                            and thus determine JOB_DIR from the given path"
+  echo "                            [default: none]"
+  echo "  -c | --command COMMAND -  command to execute inside the container, in the prefix environment"
+  echo "  -x | --http-proxy URL  -  provides URL for the environment variable http_proxy"
+  echo "  -y | --https-proxy URL -  provides URL for the environment variable https_proxy"
+}
+
+resume_tgz=
+http_proxy=
+https_proxy=
+
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+  case ${1} in
+    -h|--help)
+      display_help
+      exit 0
+      ;;
+    -r|--resume)
+      export resume_tgz="${2}"
+      shift 2
+      ;;
+    -x|--http-proxy)
+      export http_proxy="${2}"
+      shift 2
+      ;;
+    -y|--https-proxy)
+      export https_proxy="${2}"
+      shift 2
+      ;;
+    -c|--command)
+      export run_in_prefix="${2}"
+      shift 2
+      ;;
+    -*|--*)
+      echo "Error: Unknown option: ${1}" >&2
+      exit 1
+      ;;
+    *)  # No more options
+      POSITIONAL_ARGS+=("${1}") # save positional arg
+      shift
+      ;;
+  esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
+# source utils.sh and cfg_files.sh
+source scripts/utils.sh
+source scripts/cfg_files.sh
+
+if [[ -z ${resume_tgz} ]]; then
+    echo_red "path to tarball for resuming build job is missing"
+    display_help
+    exit 1
+fi
+
+job_dir=$(dirname $(dirname $(dirname ${resume_tgz})))
+
+if [[ -z ${job_dir} ]]; then
+    # job directory could be determined
+    echo_red "job directory could not be determined from '${resume_tgz}'"
+    display_help
+    exit 2
+fi
+
+# defaults
+export JOB_CFG_FILE="${job_dir}/cfg/job.cfg"
+HOST_ARCH=$(uname -m)
+
+# check if ${JOB_CFG_FILE} exists
+if [[ ! -r "${JOB_CFG_FILE}" ]]; then
+    fatal_error "job config file (JOB_CFG_FILE=${JOB_CFG_FILE}) does not exist or not readable"
+fi
+echo "bot/inspect.sh: showing ${JOB_CFG_FILE} from software-layer side"
+cat ${JOB_CFG_FILE}
+
+echo "bot/inspect.sh: obtaining configuration settings from '${JOB_CFG_FILE}'"
+cfg_load ${JOB_CFG_FILE}
+
+# if http_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $http_proxy
+HTTP_PROXY=$(cfg_get_value "site_config" "http_proxy")
+HTTP_PROXY=${HTTP_PROXY:-${http_proxy}}
+echo "bot/inspect.sh: HTTP_PROXY='${HTTP_PROXY}'"
+
+# if https_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $https_proxy
+HTTPS_PROXY=$(cfg_get_value "site_config" "https_proxy")
+HTTPS_PROXY=${HTTPS_PROXY:-${https_proxy}}
+echo "bot/inspect.sh: HTTPS_PROXY='${HTTPS_PROXY}'"
+
+LOCAL_TMP=$(cfg_get_value "site_config" "local_tmp")
+echo "bot/inspect.sh: LOCAL_TMP='${LOCAL_TMP}'"
+# TODO should local_tmp be mandatory? --> then we check here and exit if it is not provided
+
+# check if path to copy build logs to is specified, so we can copy build logs for failing builds there
+BUILD_LOGS_DIR=$(cfg_get_value "site_config" "build_logs_dir")
+echo "bot/inspect.sh: BUILD_LOGS_DIR='${BUILD_LOGS_DIR}'"
+# if $BUILD_LOGS_DIR is set, add it to $SINGULARITY_BIND so the path is available in the build container
+if [[ ! -z ${BUILD_LOGS_DIR} ]]; then
+    mkdir -p ${BUILD_LOGS_DIR}
+    if [[ -z ${SINGULARITY_BIND} ]]; then
+        export SINGULARITY_BIND="${BUILD_LOGS_DIR}"
+    else
+        export SINGULARITY_BIND="${SINGULARITY_BIND},${BUILD_LOGS_DIR}"
+    fi
+fi
+
+SINGULARITY_CACHEDIR=$(cfg_get_value "site_config" "container_cachedir")
+echo "bot/inspect.sh: SINGULARITY_CACHEDIR='${SINGULARITY_CACHEDIR}'"
+if [[ ! -z ${SINGULARITY_CACHEDIR} ]]; then
+    # make sure that separate directories are used for different CPU families
+    SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR}/${HOST_ARCH}
+    export SINGULARITY_CACHEDIR
+fi
+
+echo -n "setting \$STORAGE by replacing any var in '${LOCAL_TMP}' -> "
+# replace any env variable in ${LOCAL_TMP} with its
+#   current value (e.g., a value that is local to the job)
+STORAGE=$(envsubst <<< ${LOCAL_TMP})
+echo "'${STORAGE}'"
+
+# make sure ${STORAGE} exists
+mkdir -p ${STORAGE}
+
+# make sure the base tmp storage is unique
+JOB_STORAGE=$(mktemp --directory --tmpdir=${STORAGE} bot_job_tmp_XXX)
+echo "bot/inspect.sh: created unique base tmp storage directory at ${JOB_STORAGE}"
+
+# obtain list of modules to be loaded
+LOAD_MODULES=$(cfg_get_value "site_config" "load_modules")
+echo "bot/inspect.sh: LOAD_MODULES='${LOAD_MODULES}'"
+
+# singularity/apptainer settings: CONTAINER, HOME, TMPDIR, BIND
+CONTAINER=$(cfg_get_value "repository" "container")
+echo "bot/inspect.sh: CONTAINER='${CONTAINER}'"
+# instead of using ${PWD} as HOME in the container, we use the job directory
+# to have access to output files of the job
+export SINGULARITY_HOME="${job_dir}:/eessi_bot_job"
+echo "bot/inspect.sh: SINGULARITY_HOME='${SINGULARITY_HOME}'"
+export SINGULARITY_TMPDIR="${PWD}/singularity_tmpdir"
+echo "bot/inspect.sh: SINGULARITY_TMPDIR='${SINGULARITY_TMPDIR}'"
+mkdir -p ${SINGULARITY_TMPDIR}
+
+# load modules if LOAD_MODULES is not empty
+if [[ ! -z ${LOAD_MODULES} ]]; then
+    for mod in $(echo ${LOAD_MODULES} | tr ',' '\n')
+    do
+        echo "bot/inspect.sh: loading module '${mod}'"
+        module load ${mod}
+    done
+else
+    echo "bot/inspect.sh: no modules to be loaded"
+fi
+
+# determine repository to be used from entry .repository in ${JOB_CFG_FILE}
+REPOSITORY=$(cfg_get_value "repository" "repo_id")
+echo "bot/inspect.sh: REPOSITORY='${REPOSITORY}'"
+# TODO better to read this from tarball???
+EESSI_REPOS_CFG_DIR_OVERRIDE=$(cfg_get_value "repository" "repos_cfg_dir")
+export EESSI_REPOS_CFG_DIR_OVERRIDE=${EESSI_REPOS_CFG_DIR_OVERRIDE:-${PWD}/cfg}
+echo "bot/inspect.sh: EESSI_REPOS_CFG_DIR_OVERRIDE='${EESSI_REPOS_CFG_DIR_OVERRIDE}'"
+
+# determine EESSI version to be used from .repository.repo_version in ${JOB_CFG_FILE}
+# here, just set & export EESSI_VERSION_OVERRIDE
+# next script (eessi_container.sh) makes use of it via sourcing init scripts
+# (e.g., init/eessi_defaults or init/minimal_eessi_env)
+export EESSI_VERSION_OVERRIDE=$(cfg_get_value "repository" "repo_version")
+echo "bot/inspect.sh: EESSI_VERSION_OVERRIDE='${EESSI_VERSION_OVERRIDE}'"
+
+# determine CVMFS repo to be used from .repository.repo_name in ${JOB_CFG_FILE}
+# here, just set EESSI_CVMFS_REPO_OVERRIDE, a bit further down
+# "source init/eessi_defaults" via sourcing init/minimal_eessi_env
+export EESSI_CVMFS_REPO_OVERRIDE="/cvmfs/$(cfg_get_value 'repository' 'repo_name')"
+echo "bot/inspect.sh: EESSI_CVMFS_REPO_OVERRIDE='${EESSI_CVMFS_REPO_OVERRIDE}'"
+
+# determine architecture to be used from entry .architecture in ${JOB_CFG_FILE}
+# fallbacks:
+#  - ${CPU_TARGET} handed over from bot
+#  - left empty to let downstream script(s) determine subdir to be used
+EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(cfg_get_value "architecture" "software_subdir")
+EESSI_SOFTWARE_SUBDIR_OVERRIDE=${EESSI_SOFTWARE_SUBDIR_OVERRIDE:-${CPU_TARGET}}
+export EESSI_SOFTWARE_SUBDIR_OVERRIDE
+echo "bot/inspect.sh: EESSI_SOFTWARE_SUBDIR_OVERRIDE='${EESSI_SOFTWARE_SUBDIR_OVERRIDE}'"
+
+# get EESSI_OS_TYPE from .architecture.os_type in ${JOB_CFG_FILE} (default: linux)
+EESSI_OS_TYPE=$(cfg_get_value "architecture" "os_type")
+export EESSI_OS_TYPE=${EESSI_OS_TYPE:-linux}
+echo "bot/inspect.sh: EESSI_OS_TYPE='${EESSI_OS_TYPE}'"
+
+# prepare arguments to eessi_container.sh common to build and tarball steps
+declare -a CMDLINE_ARGS=()
+CMDLINE_ARGS+=("--verbose")
+CMDLINE_ARGS+=("--access" "rw")
+CMDLINE_ARGS+=("--mode" "run")
+[[ ! -z ${CONTAINER} ]] && CMDLINE_ARGS+=("--container" "${CONTAINER}")
+[[ ! -z ${HTTP_PROXY} ]] && CMDLINE_ARGS+=("--http-proxy" "${HTTP_PROXY}")
+[[ ! -z ${HTTPS_PROXY} ]] && CMDLINE_ARGS+=("--https-proxy" "${HTTPS_PROXY}")
+[[ ! -z ${REPOSITORY} ]] && CMDLINE_ARGS+=("--repository" "${REPOSITORY}")
+
+[[ ! -z ${resume_tgz} ]] && CMDLINE_ARGS+=("--resume" "${resume_tgz}")
+
+# create a directory for creating temporary data and scripts for the inspection
+INSPECT_DIR=$(mktemp --directory --tmpdir=${PWD} inspect.XXX)
+if [[ -z ${SINGULARITY_BIND} ]]; then
+    export SINGULARITY_BIND="${INSPECT_DIR}:/inspect_eessi_build_job"
+else
+    export SINGULARITY_BIND="${SINGULARITY_BIND},${INSPECT_DIR}:/inspect_eessi_build_job"
+fi
+
+# add arguments for temporary storage and storing a tarball of tmp
+CMDLINE_ARGS+=("--save" "${INSPECT_DIR}")
+CMDLINE_ARGS+=("--storage" "${JOB_STORAGE}")
+
+# # prepare arguments to install_software_layer.sh (specific to build step)
+# declare -a INSTALL_SCRIPT_ARGS=()
+# if [[ ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} =~ .*/generic$ ]]; then
+#     INSTALL_SCRIPT_ARGS+=("--generic")
+# fi
+# [[ ! -z ${BUILD_LOGS_DIR} ]] && INSTALL_SCRIPT_ARGS+=("--build-logs-dir" "${BUILD_LOGS_DIR}")
+
+# make sure some environment settings are available inside the shell started via
+# startprefix
+base_dir=$(dirname $(realpath $0))
+# base_dir of inspect.sh script is '.../bot', 'init' dir is at the same level
+# TODO better use script from tarball???
+source ${base_dir}/../init/eessi_defaults
+
+if [ -z $EESSI_VERSION ]; then
+    echo "ERROR: \$EESSI_VERSION must be set!" >&2
+    exit 1
+fi
+EESSI_COMPAT_LAYER_DIR="${EESSI_CVMFS_REPO}/versions/${EESSI_VERSION}/compat/linux/$(uname -m)"
+
+# NOTE The below requires access to the CVMFS repository. We could make a first
+# test run with a container. For now we skip the test.
+# if [ ! -d ${EESSI_COMPAT_LAYER_DIR} ]; then
+#     echo "ERROR: ${EESSI_COMPAT_LAYER_DIR} does not exist!" >&2
+#     exit 1
+# fi
+
+# When we want to run a script with arguments, the next line is ensures to retain
+# these arguments.
+# INPUT=$(echo "$@")
+mkdir -p ${INSPECT_DIR}/scripts
+RESUME_SCRIPT=${INSPECT_DIR}/scripts/resume_env.sh
+echo "bot/inspect.sh: creating script '${RESUME_SCRIPT}' to resume environment settings"
+
+cat << EOF > ${RESUME_SCRIPT}
+#!${EESSI_COMPAT_LAYER_DIR}/bin/bash
+echo "Sourcing '\$BASH_SOURCE' to init bot environment of build job"
+EOF
+if [ ! -z ${SLURM_JOB_ID} ]; then
+    # TODO do we need the value at all? if so which one: current or of the job to
+    # inspect?
+    echo "export CURRENT_SLURM_JOB_ID=${SLURM_JOB_ID}" >> ${RESUME_SCRIPT}
+fi
+if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then
+    echo "export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" >> ${RESUME_SCRIPT}
+fi
+if [ ! -z ${EESSI_CVMFS_REPO_OVERRIDE} ]; then
+    echo "export EESSI_CVMFS_REPO_OVERRIDE=${EESSI_CVMFS_REPO_OVERRIDE}" >> ${RESUME_SCRIPT}
+fi
+if [ ! -z ${EESSI_VERSION_OVERRIDE} ]; then
+    echo "export EESSI_VERSION_OVERRIDE=${EESSI_VERSION_OVERRIDE}" >> ${RESUME_SCRIPT}
+fi
+if [ ! -z ${http_proxy} ]; then
+    echo "export http_proxy=${http_proxy}" >> ${RESUME_SCRIPT}
+fi
+if [ ! -z ${https_proxy} ]; then
+    echo "export https_proxy=${https_proxy}" >> ${RESUME_SCRIPT}
+fi
+cat << 'EOF' >> ${RESUME_SCRIPT}
+TOPDIR=$(dirname $(realpath $BASH_SOURCE))
+
+source ${TOPDIR}/scripts/utils.sh
+
+# honor $TMPDIR if it is already defined, use /tmp otherwise
+if [ -z $TMPDIR ]; then
+    export WORKDIR=/tmp/$USER
+else
+    export WORKDIR=$TMPDIR/$USER
+fi
+
+TMPDIR=$(mktemp -d)
+
+echo ">> Setting up environment..."
+
+source $TOPDIR/init/minimal_eessi_env
+
+if [ -d $EESSI_CVMFS_REPO ]; then
+    echo_green "$EESSI_CVMFS_REPO available, OK!"
+else
+    fatal_error "$EESSI_CVMFS_REPO is not available!"
+fi
+
+# make sure we're in Prefix environment by checking $SHELL
+if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then
+    echo_green ">> It looks like we're in a Gentoo Prefix environment, good!"
+else
+    fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!"
+fi
+
+# avoid that pyc files for EasyBuild are stored in EasyBuild installation directory
+export PYTHONPYCACHEPREFIX=$TMPDIR/pycache
+
+DETECTION_PARAMETERS=''
+GENERIC=0
+EB='eb'
+if [[ "$EASYBUILD_OPTARCH" == "GENERIC" || "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" == *"/generic" ]]; then
+    echo_yellow ">> GENERIC build requested, taking appropriate measures!"
+    DETECTION_PARAMETERS="$DETECTION_PARAMETERS --generic"
+    GENERIC=1
+    export EASYBUILD_OPTARCH=GENERIC
+    EB='eb --optarch=GENERIC'
+fi
+
+echo ">> Determining software subdirectory to use for current build host..."
+if [ -z $EESSI_SOFTWARE_SUBDIR_OVERRIDE ]; then
+  export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS)
+  echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script"
+else
+  echo ">> Picking up pre-defined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE: ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
+fi
+
+# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE)
+# $EESSI_SILENT - don't print any messages
+# $EESSI_BASIC_ENV - give a basic set of environment variables
+EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables
+
+if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then
+    fatal_error "Failed to determine software subdirectory?!"
+elif [[ "${EESSI_SOFTWARE_SUBDIR}" != "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then
+    fatal_error "Values for EESSI_SOFTWARE_SUBDIR_OVERRIDE (${EESSI_SOFTWARE_SUBDIR_OVERRIDE}) and EESSI_SOFTWARE_SUBDIR (${EESSI_SOFTWARE_SUBDIR}) differ!"
+else
+    echo_green ">> Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory!"
+fi
+
+echo ">> Initializing Lmod..."
+source $EPREFIX/usr/share/Lmod/init/bash
+ml_version_out=$TMPDIR/ml.out
+ml --version &> $ml_version_out
+if [[ $? -eq 0 ]]; then
+    echo_green ">> Found Lmod ${LMOD_VERSION}"
+else
+    fatal_error "Failed to initialize Lmod?! (see output in ${ml_version_out}"
+fi
+
+echo ">> Configuring EasyBuild..."
+source $TOPDIR/configure_easybuild
+
+echo ">> Setting up \$MODULEPATH..."
+# make sure no modules are loaded
+module --force purge
+# ignore current $MODULEPATH entirely
+module unuse $MODULEPATH
+module use $EASYBUILD_INSTALLPATH/modules/all
+if [[ -z ${MODULEPATH} ]]; then
+    fatal_error "Failed to set up \$MODULEPATH?!"
+else
+    echo_green ">> MODULEPATH set up: ${MODULEPATH}"
+fi
+
+echo
+echo_green "Build environment set up with install path ${EASYBUILD_INSTALLPATH}."
+echo
+echo "The build job can be inspected with the following resources:"
+echo " - job directory is $HOME (\$HOME), check for slurm-*.out file"
+echo " - temporary data of the job is available at /tmp"
+echo " - note, the prefix $EESSI_PREFIX is writable"
+echo
+echo "You may want to load an EasyBuild module. The inspect.sh script does not load"
+echo "that automatically, because multiple versions might have been used by the job."
+echo "Choose an EasyBuild version (see installed versions with 'module avail EasyBuild')"
+echo "and simply run"
+echo
+echo "module load EasyBuild/_VERSION_"
+echo
+echo "Replace _VERSION_ with the version you want to use."
+echo
+
+EOF
+chmod u+x ${RESUME_SCRIPT}
+
+# try to map it into the container's $HOME/.profile instead
+# TODO check if script already exists, if so change its name and source it at the beginning of the RESUME_SCRIPT
+if [[ -z ${SINGULARITY_BIND} ]]; then
+    export SINGULARITY_BIND="${RESUME_SCRIPT}:/eessi_bot_job/.profile"
+else
+    export SINGULARITY_BIND="${SINGULARITY_BIND},${RESUME_SCRIPT}:/eessi_bot_job/.profile"
+fi
+
+echo "Executing command to start interactive session to inspect build job:"
+# TODO possibly add information on how to init session after the prefix is
+# entered, initialization consists of
+# - environment variable settings (see 'run_in_compat_layer_env.sh')
+# - setup steps run in 'EESSI-install-software.sh'
+# These initializations are combined into a single script that is executed when
+# the shell in startprefix is started. We set the env variable BASH_ENV here.
+if [[ -z ${run_in_prefix} ]]; then
+    echo "./eessi_container.sh ${CMDLINE_ARGS[@]}"
+    echo "                     -- ${EESSI_COMPAT_LAYER_DIR}/startprefix"
+    ./eessi_container.sh "${CMDLINE_ARGS[@]}" \
+                     -- ${EESSI_COMPAT_LAYER_DIR}/startprefix
+else
+    echo "./eessi_container.sh ${CMDLINE_ARGS[@]}"
+    echo "                     -- ${EESSI_COMPAT_LAYER_DIR}/startprefix <<< ${run_in_prefix}"
+    ./eessi_container.sh "${CMDLINE_ARGS[@]}" \
+                     -- ${EESSI_COMPAT_LAYER_DIR}/startprefix <<< ${run_in_prefix}
+fi
+
+exit 0