Skip to content

Commit

Permalink
Merge pull request #293 from TopRichard/EESSI-pr-473
Browse files Browse the repository at this point in the history
add Lmod hook to set  to ^smcuda when loading OpenMPI module to work around bug + renaming
  • Loading branch information
trz42 committed Apr 5, 2024
2 parents bc265dc + 296f00c commit dbdf9ae
Showing 1 changed file with 29 additions and 8 deletions.
37 changes: 29 additions & 8 deletions create_lmodrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
-- to load the CUDA module and print an informative message on how to set up GPU support for NESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'CUDA' then
-- get the full host_injections path
Expand All @@ -44,26 +44,26 @@
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudaDirExists = isDir(cudaEasyBuildDir)
if not cudaDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI "
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
-- otherwise, refuse to load the requested module and print error message
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n"
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
Expand All @@ -85,7 +85,7 @@
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. "let NESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
Expand All @@ -94,7 +94,28 @@
end
end
local function openmpi_load_hook(t)
-- disable smcuda BTL when loading OpenMPI module for aarch64/neoverse_v1,
-- to work around hang/crash due to bug in OpenMPI;
-- see https://gitlab.com/eessi/support/-/issues/41
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local moduleName = string.match(t.modFullName, "(.-)/")
local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
local ompiMcaBtl = os.getenv("OMPI_MCA_btl")
if ompiMcaBtl == nil then
setenv("OMPI_MCA_btl", "^smcuda")
else
setenv("OMPI_MCA_btl", ompiMcaBtl .. ",^smcuda")
end
end
end
hook.register("load", cuda_enabled_load_hook)
hook.register("load", openmpi_load_hook)
"""

def error(msg):
Expand Down

0 comments on commit dbdf9ae

Please sign in to comment.