Skip to content

Commit

Permalink
Merge pull request #4856 from camelto2/nxs_snl_mpirun
Browse files Browse the repository at this point in the history
update to SNL machines in nexus
  • Loading branch information
ye-luo committed Dec 6, 2023
2 parents 7dcfe2f + 22bf00c commit 150dd85
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 68 deletions.
34 changes: 26 additions & 8 deletions nexus/lib/machines.py
Original file line number Diff line number Diff line change
Expand Up @@ -2773,6 +2773,10 @@ class SnlMachine(Supercomputer):
outfile_extension = '.output'
errfile_extension = '.error'

# Launcher flags added here are written for mpiexec.
def post_process_job(self, job):
    """Add core-binding and per-node process-count flags to the job's run options.

    Two named entries are stored on ``job.run_options`` via its ``add`` method:

    * ``bindto``   -- the fixed flag ``--bind-to core``
    * ``npernode`` -- ``--npernode <N>`` where ``<N>`` is ``job.processes_per_node``

    Parameters
    ----------
    job
        Object exposing ``run_options`` (with an ``add(**kwargs)`` method) and
        ``processes_per_node``.  It is modified in place; nothing is returned.
    """
    add_run_option = job.run_options.add
    add_run_option(
        bindto="--bind-to core",
        npernode="--npernode {}".format(job.processes_per_node),
    )

def write_job_header(self,job):
if job.queue is None:
job.queue='batch'
Expand Down Expand Up @@ -2802,13 +2806,12 @@ def write_job_header(self,job):
job.seconds = 0
#end if


c='#!/bin/bash\n'
c+='#SBATCH -p '+str(job.queue)+'\n'
c+='#SBATCH --job-name '+str(job.name)+'\n'
c+='#SBATCH --account='+str(job.account)+'\n'
c+='#SBATCH -N '+str(job.nodes)+'\n'
c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
c+='#SBATCH --cpus-per-task={0}\n'.format(cpus_per_task)
c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
c+='#SBATCH -o {0}\n'.format(job.outfile)
c+='#SBATCH -e {0}\n'.format(job.errfile)
Expand All @@ -2835,6 +2838,18 @@ class Attaway(SnlMachine):
name = 'attaway'
#end class Attaway

class Manzano(SnlMachine):
    """SnlMachine variant identified by the name 'manzano'.

    Only the ``name`` attribute is set here; all other behaviour is
    inherited from ``SnlMachine``.
    """

    name = 'manzano'
#end class Manzano

class Ghost(SnlMachine):
    """SnlMachine variant identified by the name 'ghost'.

    Only the ``name`` attribute is set here; all other behaviour is
    inherited from ``SnlMachine``.
    """

    name = 'ghost'
#end class Ghost

class Amber(SnlMachine):
    """SnlMachine variant identified by the name 'amber'.

    Only the ``name`` attribute is set here; all other behaviour is
    inherited from ``SnlMachine``.
    """

    name = 'amber'
#end class Amber

class Uno(SnlMachine):
    """SnlMachine variant identified by the name 'uno'.

    Only the ``name`` attribute is set here; all other behaviour is
    inherited from ``SnlMachine``.
    """

    name = 'uno'
#end class Uno
Expand Down Expand Up @@ -3625,12 +3640,15 @@ def write_job_header(self,job):
Matisse( 20, 2, 8, 64, 2, 'mpirun', 'sbatch', 'sacct', 'scancel')
Komodo( 24, 2, 6, 48, 2, 'mpirun', 'sbatch', 'sacct', 'scancel')
Amos( 5120, 1, 16, 16, 128, 'srun', 'sbatch', 'sacct', 'scancel')
Chama( 1232, 2, 8, 64, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Uno( 168, 2, 8, 128, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Eclipse( 1488, 2, 18, 128, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Attaway( 1488, 2, 18, 192, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Skybridge( 1848, 2, 8, 64, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Solo( 374, 2, 18, 128, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Chama( 1232, 2, 8, 64, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Uno( 168, 2, 8, 128, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Eclipse( 1488, 2, 18, 128, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Attaway( 1488, 2, 18, 192, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Manzano( 1488, 2, 24, 192, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Ghost( 740, 2, 18, 128, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Amber( 1496, 2, 56, 256, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Skybridge( 1848, 2, 8, 64, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
Solo( 374, 2, 18, 128, 1000, 'mpiexec', 'sbatch', 'squeue', 'scancel')
SuperMUC( 512, 1, 28, 256, 8,'mpiexec', 'llsubmit', 'llq','llcancel')
Stampede2( 4200, 1, 68, 96, 50, 'ibrun', 'sbatch', 'squeue', 'scancel')
CadesMoab( 156, 2, 18, 128, 100, 'mpirun', 'qsub', 'qstat', 'qdel')
Expand Down
171 changes: 111 additions & 60 deletions nexus/tests/unit/test_machines.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,12 @@ def job_commands_equal(c1,c2):
#end def job_command_equal

job_run_ref = obj({
('amber' , 'n1' ) : 'mpiexec --bind-to core -n 112 --npernode 112 test.x',
('amber' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('amber' , 'n2' ) : 'mpiexec --bind-to core -n 224 --npernode 112 test.x',
('amber' , 'n2_t2' ) : 'mpiexec --bind-to core -n 112 --npernode 56 test.x',
('amber' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 112 --npernode 56 test.x',
('amber' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('amos' , 'n1' ) : 'srun test.x',
('amos' , 'n1_p1' ) : 'srun test.x',
('amos' , 'n2' ) : 'srun test.x',
Expand All @@ -1039,12 +1045,12 @@ def job_commands_equal(c1,c2):
('archer2' , 'n2_t2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 128 test.x',
('archer2' , 'n2_t2_e' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 128 test.x',
('archer2' , 'n2_t2_p2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 4 test.x',
('attaway' , 'n1' ) : 'srun test.x',
('attaway' , 'n1_p1' ) : 'srun test.x',
('attaway' , 'n2' ) : 'srun test.x',
('attaway' , 'n2_t2' ) : 'srun test.x',
('attaway' , 'n2_t2_e' ) : 'srun test.x',
('attaway' , 'n2_t2_p2' ) : 'srun test.x',
('attaway' , 'n1' ) : 'mpiexec --bind-to core -n 36 --npernode 36 test.x',
('attaway' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('attaway' , 'n2' ) : 'mpiexec --bind-to core -n 72 --npernode 36 test.x',
('attaway' , 'n2_t2' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('attaway' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('attaway' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('bluewaters_xe' , 'n1' ) : 'aprun -n 32 test.x',
('bluewaters_xe' , 'n1_p1' ) : 'aprun -n 1 test.x',
('bluewaters_xe' , 'n2' ) : 'aprun -n 64 test.x',
Expand Down Expand Up @@ -1075,12 +1081,12 @@ def job_commands_equal(c1,c2):
('cetus' , 'n2_t2' ) : 'runjob --envs OMP_NUM_THREADS=2 --np 16 -p 8 --verbose=INFO $LOCARGS : test.x',
('cetus' , 'n2_t2_e' ) : 'runjob --envs OMP_NUM_THREADS=2 ENV_VAR=1 --np 16 -p 8 --verbose=INFO $LOCARGS : test.x',
('cetus' , 'n2_t2_p2' ) : 'runjob --envs OMP_NUM_THREADS=2 --np 4 -p 2 --verbose=INFO $LOCARGS : test.x',
('chama' , 'n1' ) : 'srun test.x',
('chama' , 'n1_p1' ) : 'srun test.x',
('chama' , 'n2' ) : 'srun test.x',
('chama' , 'n2_t2' ) : 'srun test.x',
('chama' , 'n2_t2_e' ) : 'srun test.x',
('chama' , 'n2_t2_p2' ) : 'srun test.x',
('chama' , 'n1' ) : 'mpiexec --bind-to core -n 16 --npernode 16 test.x',
('chama' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('chama' , 'n2' ) : 'mpiexec --bind-to core -n 32 --npernode 16 test.x',
('chama' , 'n2_t2' ) : 'mpiexec --bind-to core -n 16 --npernode 8 test.x',
('chama' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 16 --npernode 8 test.x',
('chama' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('cooley' , 'n1' ) : 'mpirun -np 12 test.x',
('cooley' , 'n1_p1' ) : 'mpirun -np 1 test.x',
('cooley' , 'n2' ) : 'mpirun -np 24 test.x',
Expand All @@ -1093,18 +1099,24 @@ def job_commands_equal(c1,c2):
('cori' , 'n2_t2' ) : 'srun test.x',
('cori' , 'n2_t2_e' ) : 'srun test.x',
('cori' , 'n2_t2_p2' ) : 'srun test.x',
('eclipse' , 'n1' ) : 'srun test.x',
('eclipse' , 'n1_p1' ) : 'srun test.x',
('eclipse' , 'n2' ) : 'srun test.x',
('eclipse' , 'n2_t2' ) : 'srun test.x',
('eclipse' , 'n2_t2_e' ) : 'srun test.x',
('eclipse' , 'n2_t2_p2' ) : 'srun test.x',
('eclipse' , 'n1' ) : 'mpiexec --bind-to core -n 36 --npernode 36 test.x',
('eclipse' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('eclipse' , 'n2' ) : 'mpiexec --bind-to core -n 72 --npernode 36 test.x',
('eclipse' , 'n2_t2' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('eclipse' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('eclipse' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('eos' , 'n1' ) : 'aprun -n 16 test.x',
('eos' , 'n1_p1' ) : 'aprun -n 1 test.x',
('eos' , 'n2' ) : 'aprun -n 32 test.x',
('eos' , 'n2_t2' ) : 'aprun -ss -cc numa_node -d 2 -n 16 test.x',
('eos' , 'n2_t2_e' ) : 'aprun -ss -cc numa_node -d 2 -n 16 test.x',
('eos' , 'n2_t2_p2' ) : 'aprun -ss -cc numa_node -d 2 -n 4 test.x',
('ghost' , 'n1' ) : 'mpiexec --bind-to core -n 36 --npernode 36 test.x',
('ghost' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('ghost' , 'n2' ) : 'mpiexec --bind-to core -n 72 --npernode 36 test.x',
('ghost' , 'n2_t2' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('ghost' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('ghost' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('jaguar' , 'n1' ) : 'aprun -n 16 test.x',
('jaguar' , 'n1_p1' ) : 'aprun -n 1 test.x',
('jaguar' , 'n2' ) : 'aprun -n 32 test.x',
Expand All @@ -1129,6 +1141,12 @@ def job_commands_equal(c1,c2):
('lonestar' , 'n2_t2' ) : 'ibrun -n 12 -o 0 test.x',
('lonestar' , 'n2_t2_e' ) : 'ibrun -n 12 -o 0 test.x',
('lonestar' , 'n2_t2_p2' ) : 'ibrun -n 4 -o 0 test.x',
('manzano' , 'n1' ) : 'mpiexec --bind-to core -n 48 --npernode 48 test.x',
('manzano' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('manzano' , 'n2' ) : 'mpiexec --bind-to core -n 96 --npernode 48 test.x',
('manzano' , 'n2_t2' ) : 'mpiexec --bind-to core -n 48 --npernode 24 test.x',
('manzano' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 48 --npernode 24 test.x',
('manzano' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('matisse' , 'n1' ) : 'mpirun -np 16 test.x',
('matisse' , 'n1_p1' ) : 'mpirun -np 1 test.x',
('matisse' , 'n2' ) : 'mpirun -np 32 test.x',
Expand Down Expand Up @@ -1165,18 +1183,18 @@ def job_commands_equal(c1,c2):
('rhea' , 'n2_t2' ) : 'srun -N 2 -n 16 -c 2 --cpu-bind=cores test.x',
('rhea' , 'n2_t2_e' ) : 'srun -N 2 -n 16 -c 2 --cpu-bind=cores test.x',
('rhea' , 'n2_t2_p2' ) : 'srun -N 2 -n 4 -c 2 --cpu-bind=cores test.x',
('skybridge' , 'n1' ) : 'srun test.x',
('skybridge' , 'n1_p1' ) : 'srun test.x',
('skybridge' , 'n2' ) : 'srun test.x',
('skybridge' , 'n2_t2' ) : 'srun test.x',
('skybridge' , 'n2_t2_e' ) : 'srun test.x',
('skybridge' , 'n2_t2_p2' ) : 'srun test.x',
('solo' , 'n1' ) : 'srun test.x',
('solo' , 'n1_p1' ) : 'srun test.x',
('solo' , 'n2' ) : 'srun test.x',
('solo' , 'n2_t2' ) : 'srun test.x',
('solo' , 'n2_t2_e' ) : 'srun test.x',
('solo' , 'n2_t2_p2' ) : 'srun test.x',
('skybridge' , 'n1' ) : 'mpiexec --bind-to core -n 16 --npernode 16 test.x',
('skybridge' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('skybridge' , 'n2' ) : 'mpiexec --bind-to core -n 32 --npernode 16 test.x',
('skybridge' , 'n2_t2' ) : 'mpiexec --bind-to core -n 16 --npernode 8 test.x',
('skybridge' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 16 --npernode 8 test.x',
('skybridge' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('solo' , 'n1' ) : 'mpiexec --bind-to core -n 36 --npernode 36 test.x',
('solo' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('solo' , 'n2' ) : 'mpiexec --bind-to core -n 72 --npernode 36 test.x',
('solo' , 'n2_t2' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('solo' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 36 --npernode 18 test.x',
('solo' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('stampede2' , 'n1' ) : 'ibrun -n 68 -o 0 test.x',
('stampede2' , 'n1_p1' ) : 'ibrun -n 1 -o 0 test.x',
('stampede2' , 'n2' ) : 'ibrun -n 136 -o 0 test.x',
Expand Down Expand Up @@ -1227,12 +1245,12 @@ def job_commands_equal(c1,c2):
('tomcat3' , 'n2_t2' ) : 'mpirun -np 64 test.x',
('tomcat3' , 'n2_t2_e' ) : 'mpirun -np 64 test.x',
('tomcat3' , 'n2_t2_p2' ) : 'mpirun -np 4 test.x',
('uno' , 'n1' ) : 'srun test.x',
('uno' , 'n1_p1' ) : 'srun test.x',
('uno' , 'n2' ) : 'srun test.x',
('uno' , 'n2_t2' ) : 'srun test.x',
('uno' , 'n2_t2_e' ) : 'srun test.x',
('uno' , 'n2_t2_p2' ) : 'srun test.x',
('uno' , 'n1' ) : 'mpiexec --bind-to core -n 16 --npernode 16 test.x',
('uno' , 'n1_p1' ) : 'mpiexec --bind-to core -n 1 --npernode 1 test.x',
('uno' , 'n2' ) : 'mpiexec --bind-to core -n 32 --npernode 16 test.x',
('uno' , 'n2_t2' ) : 'mpiexec --bind-to core -n 16 --npernode 8 test.x',
('uno' , 'n2_t2_e' ) : 'mpiexec --bind-to core -n 16 --npernode 8 test.x',
('uno' , 'n2_t2_p2' ) : 'mpiexec --bind-to core -n 4 --npernode 2 test.x',
('vesta' , 'n1' ) : 'runjob --envs OMP_NUM_THREADS=1 --np 16 -p 16 --verbose=INFO $LOCARGS : test.x',
('vesta' , 'n1_p1' ) : 'runjob --envs OMP_NUM_THREADS=1 --np 1 -p 1 --verbose=INFO $LOCARGS : test.x',
('vesta' , 'n2' ) : 'runjob --envs OMP_NUM_THREADS=1 --np 32 -p 16 --verbose=INFO $LOCARGS : test.x',
Expand Down Expand Up @@ -1368,6 +1386,19 @@ def test_write_job():
Machine.allow_warnings = False

job_write_ref = dict(
amber = '''#!/bin/bash
#SBATCH -p batch
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export ENV_VAR=1
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 224 --npernode 112 test.x''',
amos = '''#!/bin/bash -x
#SBATCH --export=ALL
#SBATCH -J None
Expand Down Expand Up @@ -1425,15 +1456,14 @@ def test_write_job():
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=36
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 72 --npernode 36 test.x''',
bluewaters_xe = '''#!/bin/bash
#PBS -N jobname
#PBS -l walltime=06:30:00
Expand Down Expand Up @@ -1513,15 +1543,14 @@ def test_write_job():
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 32 --npernode 16 test.x''',
cooley = '''#!/bin/bash
#COBALT -q default
#COBALT -A ABC123
Expand Down Expand Up @@ -1555,15 +1584,14 @@ def test_write_job():
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=36
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 72 --npernode 36 test.x''',
eos = '''#!/bin/bash
#PBS -A ABC123
#PBS -q batch
Expand All @@ -1581,6 +1609,19 @@ def test_write_job():
export OMP_NUM_THREADS=1
export ENV_VAR=1
aprun -n 32 test.x''',
ghost = '''#!/bin/bash
#SBATCH -p batch
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export ENV_VAR=1
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 72 --npernode 36 test.x''',
jaguar = '''#!/bin/bash
#PBS -A ABC123
#PBS -q batch
Expand Down Expand Up @@ -1649,6 +1690,19 @@ def test_write_job():
export OMP_NUM_THREADS=1
export ENV_VAR=1
ibrun -n 24 -o 0 test.x''',
manzano = '''#!/bin/bash
#SBATCH -p batch
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export ENV_VAR=1
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 96 --npernode 48 test.x''',
matisse = '''#!/bin/bash -x
#SBATCH --export=ALL
#SBATCH -J None
Expand Down Expand Up @@ -1749,29 +1803,27 @@ def test_write_job():
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 32 --npernode 16 test.x''',
solo = '''#!/bin/bash
#SBATCH -p batch
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=36
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 72 --npernode 36 test.x''',
stampede2 = '''#!/bin/bash
#SBATCH --job-name jobname
#SBATCH --account=ABC123
Expand Down Expand Up @@ -1901,15 +1953,14 @@ def test_write_job():
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
export OMP_NUM_THREADS=1
mpiexec --bind-to core -n 32 --npernode 16 test.x''',
vesta = '''#!/bin/bash
#COBALT -q default
#COBALT -A ABC123
Expand Down

0 comments on commit 150dd85

Please sign in to comment.