Support Andes at OLCF: NEXUS and build scripts #3073

Merged: 7 commits, Apr 5, 2021

Changes from all commits
42 changes: 42 additions & 0 deletions config/build_olcf_andes.sh
@@ -0,0 +1,42 @@
#!/bin/bash

BUILD_MODULES=config/load_olcf_andes_modules.sh

module purge
echo "Purging current module set"
echo "Sourcing file: $BUILD_MODULES to build QMCPACK"

. $BUILD_MODULES

echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"

export BLAS_LIBS="-L$OLCF_OPENBLAS_ROOT/lib -lopenblas"
export LAPACK_LIBS="$BLAS_LIBS $OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.a"

declare -A builds=( ["cpu"]="-DBUILD_PPCONVERT=1" \
                    ["complex_cpu"]="-DQMC_COMPLEX=1" \
#                   ["legacy_gpu"]="-DQMC_CUDA=1 " \
#                   ["complex_legacy_gpu"]="-DQMC_CUDA=1 -DQMC_COMPLEX=1 " \
                  )

mkdir bin_andes

for build in "${!builds[@]}"
do
    echo "building: $build with ${builds[$build]}"
    rm bin_andes/qmcpack_${build}
    mkdir build_andes_${build}
    cd build_andes_${build}
    cmake -DCMAKE_C_COMPILER="mpicc" \
          -DCMAKE_CXX_COMPILER="mpicxx" \
          -DBUILD_LMYENGINE_INTERFACE=0 \
          ${builds[$build]} \
          ..
    make -j 20
    if [ $? -eq 0 ]; then
        build_dir=$(pwd)
        ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin_andes/qmcpack_${build}
    fi
    cd ..
done
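
For reference, a minimal sketch of how this script is meant to be driven (the relative config/ path implies it is launched from the top of a QMCPACK source checkout; the checkout path below is a placeholder, not mandated by the PR):

cd /path/to/qmcpack                # your source checkout (placeholder path)
bash config/build_olcf_andes.sh    # builds each variant enabled in the builds array
ls -l bin_andes/                   # on success: qmcpack_cpu and qmcpack_complex_cpu symlinks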

19 changes: 19 additions & 0 deletions config/load_olcf_andes_modules.sh
@@ -0,0 +1,19 @@
#!/bin/bash
echo "Loading QMCPACK dependency modules for andes"
echo "https://docs.olcf.ornl.gov/systems/andes_user_guide.html"
echo
module load gcc/9.3.0
#module load intel/19.0.3
module load openmpi/4.0.4
#module load essl
module load openblas/0.3.12
module load netlib-lapack
#module load netlib-scalapack
module load hdf5
module load fftw
export FFTW_ROOT=$OLCF_FFTW_ROOT
module load cmake/3.18.4
module load boost/1.74.0
#module load cuda
module load python/3.7-anaconda3
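
As build_olcf_andes.sh echoes, the same modules must be loaded (or this file sourced) in the job that runs QMCPACK. A minimal run fragment, assuming a two-node job filling Andes' 32-core nodes (the input file name is a placeholder):

source config/load_olcf_andes_modules.sh
srun -N 2 -n 64 ./bin_andes/qmcpack_cpu qmc.in.xml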

158 changes: 158 additions & 0 deletions nexus/lib/machines.py
@@ -3145,6 +3145,162 @@ def write_job_header(self,job):
#end class Rhea


## Added 19/03/2021 by A Zen
class Andes(Supercomputer):

    name = 'andes'
    requires_account  = True
    batch_capable     = True
    #executable_subfile = True
    prefixed_output   = True
    outfile_extension = '.output'
    errfile_extension = '.error'

    def post_process_job(self,job):
        job.run_options.add(
            N='-N {}'.format(job.nodes),
            n='-n {}'.format(job.processes),
            )
        if job.threads>1:
            job.run_options.add(
                c = '-c {}'.format(job.threads),
                )
            if 'cpu_bind' not in job.run_options:
                if job.processes_per_node==self.cores_per_node:
                    cpu_bind = '--cpu-bind=threads'
                else:
                    cpu_bind = '--cpu-bind=cores'
                #end if
                job.run_options.add(
                    cpu_bind = cpu_bind
                    )
            #end if
        #end if
    #end def post_process_job

    def write_job_header(self,job):
        if job.queue is None:
            job.queue='batch'
        #end if
        base_partition = None
        max_partition = 384
        if job.nodes <= 16:
            max_time = 48
        elif job.nodes <= 64:
            max_time = 36
        else:
            max_time = 3
        job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
        if job.total_hours > max_time: # warn if the request exceeds the walltime limit for this node count
            self.warn('!!! ATTENTION !!!\n  the maximum runtime on {0} should not be more than {1}\n  you requested: {2}'.format(job.queue,max_time,job.total_hours))
            job.hours   = max_time
            job.minutes = 0
            job.seconds = 0
        #end if

        c='#!/bin/bash\n'
        c+='#SBATCH --job-name '+str(job.name)+'\n'
        c+='#SBATCH --account='+str(job.account)+'\n'
        c+='#SBATCH -N '+str(job.nodes)+'\n'
        c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
        c+='#SBATCH -o {0}\n'.format(job.outfile)
        c+='#SBATCH -e {0}\n'.format(job.errfile)
        if job.email is not None:
            c+='#SBATCH --mail-user {}\n'.format(job.email)
            c+='#SBATCH --mail-type ALL\n'
            #c+='#SBATCH --mail-type FAIL\n'
        #end if
        c+='\n'
        c+='cd $SLURM_SUBMIT_DIR\n'
        c+='\n'
        c+='echo JobID : $SLURM_JOBID \n'
        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES \n'
        c+='echo List of nodes assigned to the job: $SLURM_NODELIST \n'
        c+='\n'
        return c
    #end def write_job_header
#end class Andes


class Tomcat3(Supercomputer):
name = 'tomcat3'
requires_account = False
@@ -3214,6 +3370,8 @@ def write_job_header(self,job):
CadesSlurm( 156, 2, 18, 128, 100, 'mpirun', 'sbatch', 'squeue', 'scancel')
Summit( 4608, 2, 21, 512, 100, 'jsrun', 'bsub', 'bjobs', 'bkill')
Rhea( 512, 2, 8, 128, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Andes( 704, 2, 16, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
#Andes_gpu( 9, 2, 14, 1024, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
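
To make the new entry concrete, this is roughly the submission script that write_job_header and post_process_job above assemble for a hypothetical 1-hour, 2-node, 64-process, single-threaded job named qmc (the account, file names, and executable are placeholders):

#!/bin/bash
#SBATCH --job-name qmc
#SBATCH --account=mat000
#SBATCH -N 2
#SBATCH -t 01:00:00
#SBATCH -o qmc.output
#SBATCH -e qmc.error

cd $SLURM_SUBMIT_DIR

echo JobID : $SLURM_JOBID
echo Number of nodes requested: $SLURM_JOB_NUM_NODES
echo List of nodes assigned to the job: $SLURM_NODELIST

# 64 processes on 2 nodes fills all 32 cores per node, so
# post_process_job selects --cpu-bind=threads for the run line:
srun -N 2 -n 64 --cpu-bind=threads ./qmcpack_cpu qmc.in.xml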
