Skip to content

Commit

Permalink
Merge pull request #1394 from jtkrogel/nx_summit
Browse files Browse the repository at this point in the history
Nexus: add support for Summit
  • Loading branch information
ye-luo committed Feb 19, 2019
2 parents 4a26459 + 951727b commit 3268f08
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 3 deletions.
29 changes: 28 additions & 1 deletion nexus/bin/nxs-test
Original file line number Diff line number Diff line change
Expand Up @@ -2259,6 +2259,14 @@ def machines():
('stampede2' , 'n2_t2' ) : 'ibrun -n 68 -o 0 test.x',
('stampede2' , 'n2_t2_e' ) : 'ibrun -n 68 -o 0 test.x',
('stampede2' , 'n2_t2_p2' ) : 'ibrun -n 4 -o 0 test.x',
('summit' , 'n1' ) : 'jsrun -a 21 -r 2 -b rs -c 21 -d packed -n 2 -g 0 test.x',
('summit' , 'n1_g6' ) : 'jsrun -a 7 -r 6 -b rs -c 7 -d packed -n 6 -g 1 test.x',
('summit' , 'n2' ) : 'jsrun -a 21 -r 2 -b rs -c 21 -d packed -n 4 -g 0 test.x',
('summit' , 'n2_g6' ) : 'jsrun -a 7 -r 6 -b rs -c 7 -d packed -n 12 -g 1 test.x',
('summit' , 'n2_t2' ) : 'jsrun -a 10 -r 2 -b rs -c 20 -d packed -n 4 -g 0 test.x',
('summit' , 'n2_t2_e' ) : 'jsrun -a 10 -r 2 -b rs -c 20 -d packed -n 4 -g 0 test.x',
('summit' , 'n2_t2_e_g6' ) : 'jsrun -a 3 -r 6 -b rs -c 6 -d packed -n 12 -g 1 test.x',
('summit' , 'n2_t2_g6' ) : 'jsrun -a 3 -r 6 -b rs -c 6 -d packed -n 12 -g 1 test.x',
('supermuc' , 'n1' ) : 'mpiexec -n 40 test.x',
('supermuc' , 'n1_p1' ) : 'mpiexec -n 1 test.x',
('supermuc' , 'n2' ) : 'mpiexec -n 80 test.x',
Expand Down Expand Up @@ -2297,7 +2305,7 @@ def machines():
('vesta' , 'n2_t2_p2' ) : 'runjob --np 4 -p 2 $LOCARGS --verbose=INFO --envs OMP_NUM_THREADS=2 : test.x',
})

job_inputs = obj(
job_inputs_orig = obj(
n1 = obj(nodes=1),
n1_p1 = obj(nodes=1,processes_per_node=1),
n2 = obj(nodes=2),
Expand All @@ -2312,6 +2320,22 @@ def machines():
else:
acc = None
#end if
job_inputs = job_inputs_orig
if name=='summit': # exceptional treatment for summit nodes
job_inputs = job_inputs_orig.copy()
jtypes = list(job_inputs.keys())
for jtype in jtypes:
if 'p' in jtype:
del job_inputs[jtype]
else:
jcpu = job_inputs[jtype]
jcpu.gpus = 0
jgpu = jcpu.copy()
jgpu.gpus = 6
job_inputs[jtype+'_g6'] = jgpu
#end if
#end for
#end if
for jtype in sorted(job_inputs.keys()):
job = Job(app_command = 'test.x',
machine = name,
Expand Down Expand Up @@ -2339,6 +2363,9 @@ def machines():
if m.app_launcher=='srun': # no slurm support yet
continue
#end if
if name=='summit': # no summit support
continue
#end if
if m.requires_account:
acc = 'ABC123'
else:
Expand Down
136 changes: 134 additions & 2 deletions nexus/lib/machines.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def __init__(self,
threads = 1, # number of openmp threads for the job
hyperthreads = None,
ppn = None,
gpus = None, # number of gpus per node
compiler = None,
options = None,
app_options = None,
Expand All @@ -224,6 +225,7 @@ def __init__(self,
processes_per_node = None,
email = None,
constraint = None, # slurm specific, Cori
alloc_flags = None, # lsf specific, Summit
fake = False,
):

Expand All @@ -249,6 +251,7 @@ def __init__(self,
self.threads = threads
self.hyperthreads = hyperthreads
self.ppn = ppn
self.gpus = gpus
self.compiler = compiler
self.app_options = Options()
self.run_options = Options()
Expand All @@ -268,6 +271,7 @@ def __init__(self,
self.account = account
self.email = email
self.constraint = constraint
self.alloc_flags = alloc_flags
self.internal_id = None
self.system_id = None
self.tot_cores = None
Expand Down Expand Up @@ -611,6 +615,14 @@ def ll_walltime(self):
+str(int(self.seconds)).zfill(2)
return walltime
#end def ll_walltime


def lsf_walltime(self):
    """Return the walltime as an ``HH:MM`` string for LSF.

    Days are folded into the hour count (24 hours per day).  Both fields
    are truncated to integers and zero-padded to at least two digits.
    The ``seconds`` field is not part of the result.
    """
    total_hours   = int(24*self.days + self.hours)
    whole_minutes = int(self.minutes)
    parts = [str(total_hours).zfill(2), str(whole_minutes).zfill(2)]
    return ':'.join(parts)
#end def lsf_walltime


def normalize_time(self):
Expand Down Expand Up @@ -1323,10 +1335,19 @@ def __init__(self,
EP = 'preempt_pending',
MP = 'resume_pending',
)
elif self.queue_querier=='bjobs':
self.job_states=dict(PEND = 'pending',
RUN = 'running',
DONE = 'complete',
EXIT = 'failed',
PSUSP = 'suspended',
USUSP = 'suspended',
SSUSP = 'suspended',
)
else:
self.error('ability to query queue with '+self.queue_querier+' has not yet been implemented')
#end if

#end def __init__


Expand Down Expand Up @@ -1458,6 +1479,8 @@ def process_job_options(self,job):
np = '-n '+str(job.processes),
p = '-o '+str(0),
)
elif launcher=='jsrun': # Summit
None # Summit class takes care of this in process_job_extra
else:
self.error(launcher+' is not yet implemented as an application launcher')
#end if
Expand All @@ -1473,7 +1496,7 @@ def query_queue(self,out=None):
self.system_queue.clear()
if self.queue_querier=='qstat':
if out is None:
out,err = Popen('qstat -a',shell=True,stdout=PIPE,stderr=PIPE,close_fds=True).communicate()
out,err = Popen('qstat -a',shell=True,stdout=PIPE,stderr=PIPE,close_fds=True).communicate()
#end if
lines = out.splitlines()
for line in lines:
Expand Down Expand Up @@ -1612,6 +1635,26 @@ def query_queue(self,out=None):
#end if
#end if
#end for
elif self.queue_querier=='bjobs':
if out is None:
out,err = Popen('bjobs',shell=True,stdout=PIPE,stderr=PIPE,close_fds=True).communicate()
#end if
lines = out.splitlines()
for line in lines:
tokens=line.split()
if len(tokens)>0:
spid = tokens[0]
if spid.isdigit() and len(tokens)==8:
pid = int(spid)
jid,uname,status,slots,queue,start,finish,jname = tokens
if status in self.job_states:
self.system_queue[pid] = self.job_states[status]
else:
self.error('job state '+status+' is unrecognized')
#end if
#end if
#end if
#end for
else:
self.error('ability to query queue with '+self.queue_querier+' has not yet been implemented')
#end if
Expand Down Expand Up @@ -2981,6 +3024,94 @@ def write_job_header(self,job):
#end class Cades



class Summit(Supercomputer):
    """Supercomputer description for Summit.

    Supplies default jsrun launch options, writes the ``#BSUB`` batch
    header, and parses the job id out of the submission output.
    """

    name = 'summit'
    requires_account = True
    batch_capable    = True

    def process_job_extra(self,job):
        """Fill in default jsrun options on ``job.run_options``.

        Nothing is changed if the user already supplied run options.
        Otherwise ``job.gpus`` defaults to 6 and ``job.alloc_flags`` to
        ``'smt1'``, and the resource-set layout (``-n``, ``-r``, ``-a``,
        ``-c``, ``-g``) is derived from the job's node, process, thread
        and gpu counts.
        """
        # add the options only if the user has not supplied options
        if len(job.run_options)==0:
            opt = obj(
                launch_dist = '-d packed',
                bind        = '-b rs',
                )
            if job.gpus is None:
                job.gpus = 6 # gpus to use per node
            #end if
            if job.alloc_flags is None:
                job.alloc_flags = 'smt1'
            #end if
            if job.gpus==0:
                # cpu-only: two resource sets per node when the total
                # process count is even, otherwise a single one
                if job.processes%2==0:
                    resource_sets_per_node = 2
                else:
                    resource_sets_per_node = 1
                #end if
                nrs   = job.nodes*resource_sets_per_node
                # floor division keeps the count an integer under both
                # Python 2 and Python 3 (jsrun flags must be integers)
                pprs  = job.processes_per_node//resource_sets_per_node
                gpurs = 0
            else:
                ppn = job.processes_per_node
                if ppn is None:
                    self.warn('job may not run properly on Summit\nat least one mpi process should be present for each node\nplease check the generated bsub file for correctness')
                    ppn = 0
                #end if
                if ppn%job.gpus!=0:
                    self.warn('job may not run properly on Summit\nprocesses per node should divide evenly into number of gpus requested\nprocesses per node requested: {0}\ngpus per node requested: {1}\nplease check the generated bsub file for correctness'.format(job.processes_per_node,job.gpus))
                #end if
                # gpu runs: one resource set per requested gpu
                resource_sets_per_node = job.gpus
                nrs   = job.nodes*resource_sets_per_node
                # floor division: any remainder was already warned about above
                pprs  = ppn//resource_sets_per_node
                gpurs = 1
            #end if
            opt.set(
                resource_sets= '-n {0}'.format(nrs),
                rs_per_node  = '-r {0}'.format(resource_sets_per_node),
                tasks_per_rs = '-a {0}'.format(pprs),
                cpus_per_rs  = '-c {0}'.format(pprs*job.threads),
                gpus_per_rs  = '-g {0}'.format(gpurs),
                )
            job.run_options.add(**opt)
        #end if
    #end def process_job_extra


    def write_job_header(self,job):
        """Return the bash shebang and ``#BSUB`` directives for ``job``.

        The ``-alloc_flags`` line is written only when
        ``job.alloc_flags`` is not None.
        """
        c ='#!/bin/bash\n'
        c+='#BSUB -P {0}\n'.format(job.account)
        c+='#BSUB -J {0}\n'.format(job.name)
        c+='#BSUB -o {0}\n'.format(job.outfile)
        c+='#BSUB -e {0}\n'.format(job.errfile)
        c+='#BSUB -W {0}\n'.format(job.lsf_walltime())
        c+='#BSUB -nnodes {0}\n'.format(job.nodes)
        if job.alloc_flags is not None:
            c+='#BSUB -alloc_flags "{0}"\n'.format(job.alloc_flags)
        #end if
        return c
    #end def write_job_header


    def read_process_id(self,output):
        """Return the first all-digit ``<...>`` token in ``output`` as an int.

        Returns None when no whitespace-separated token of that form is
        found.
        """
        # NOTE(review): presumably bsub prints something like
        # "Job <12345> is submitted to queue <batch>." -- confirm
        pid = None
        tokens = output.split()
        for t in tokens:
            if t.startswith('<'):
                spid = t.strip('<>').strip()
                if spid.isdigit():
                    pid = int(spid)
                    break
                #end if
            #end if
        #end for
        return pid
    #end def read_process_id
#end class Summit



#Known machines
# workstations
for cores in range(1,128+1):
Expand Down Expand Up @@ -3017,6 +3148,7 @@ def write_job_header(self,job):
SuperMUC( 205, 4, 10, 256, 8,'mpiexec', 'llsubmit', 'llq','llcancel')
Stampede2( 4200, 1, 68, 96, 50, 'ibrun', 'sbatch', 'squeue', 'scancel')
Cades( 156, 2, 18, 128, 100, 'mpirun', 'qsub', 'qstat', 'qdel')
Summit( 4608, 2, 21, 512, 100, 'jsrun', 'bsub', 'bjobs', 'bkill')

#machine accessor functions
get_machine_name = Machine.get_hostname
Expand Down

0 comments on commit 3268f08

Please sign in to comment.