Skip to content

Commit

Permalink
Merge pull request #137 from ReactionMechanismGenerator/memory
Browse files Browse the repository at this point in the history
Organized job memory handling
  • Loading branch information
alongd committed Jun 17, 2019
2 parents faed1a0 + 0a956da commit 515462f
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 41 deletions.
2 changes: 1 addition & 1 deletion arc/job/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
input_files = {
'gaussian': """%chk=check.chk
%mem={memory}mb
%nproc={cpus}
%NProcShared={cpus}
#P {job_type_1} {restricted}{method}{slash}{basis} {job_type_2} {fine} {trsh} iop(2/9=2000)
Expand Down
38 changes: 28 additions & 10 deletions arc/job/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class Job(object):
`scan_res` ``int`` The rotor scan resolution in degrees
`software` ``str`` The electronic structure software to be used
`server_nodes` ``list`` A list of nodes this job was submitted to (for troubleshooting)
`memory` ``int`` The allocated memory (1500 MB by default)
`memory` ``int`` The total job allocated memory in GB (15 by default)
`method` ``str`` The calculation method (e.g., 'B3LYP', 'CCSD(T)', 'CBS-QB3'...)
`basis_set` ``str`` The basis set (e.g., '6-311++G(d,p)', 'aug-cc-pVTZ'...)
`fine` ``bool`` Whether to use fine geometry optimization parameters
Expand Down Expand Up @@ -97,7 +97,7 @@ class Job(object):
"""
def __init__(self, project, ess_settings, species_name, xyz, job_type, level_of_theory, multiplicity,
project_directory, charge=0, conformer=-1, fine=False, shift='', software=None, is_ts=False, scan='',
pivots=None, memory=15000, comments='', trsh='', scan_trsh='', ess_trsh_methods=None, bath_gas=None,
pivots=None, memory=15, comments='', trsh='', scan_trsh='', ess_trsh_methods=None, bath_gas=None,
initial_trsh=None, job_num=None, job_server_name=None, job_name=None, job_id=None, server=None,
initial_time=None, occ=None, max_job_time=120, scan_res=None, checkfile=None, number_of_radicals=None,
testing=False):
Expand Down Expand Up @@ -295,10 +295,30 @@ def __init__(self, project, ess_settings, species_name, xyz, job_type, level_of_

self.server = server if server is not None else self.ess_settings[self.software][0]

self.cpus = servers[self.server].get('cpus', 8) # set to 8 by default
self.mem_per_cpu = memory * 1000 / self.cpus # The `#SBATCH --mem-per-cpu` directive is in MB
max_mem = servers[self.server].get('memory', None) # max memory per node
if max_mem is not None and memory > max_mem * 0.9:
logging.warning('The memory for job {0} using {1} ({2} GB) exceeds 90% of the the maximum node memory on '
'{3}. Setting it to 90% * {4} GB.'.format(self.job_name, self.software,
memory, self.server, max_mem))
memory = 0.9 * max_mem
self.memory_gb = memory # store the memory in GB for troubleshooting (when re-running the job)
if self.software == 'molpro':
# molpro's memory is in MW, 1500 MW should be enough as an initial general memory requirement assessment
memory /= 10
self.memory = memory
# Molpro's memory is per cpu and in MW (mega word; 1 MW ~= 8 MB; 1 GB = 128 MW)
self.memory = memory * 128 / self.cpus
if self.software == 'terachem':
# TeraChem's memory is in MW (mega word; 1 MW ~= 8 MB; 1 GB = 128 MW)
self.memory = memory * 128
elif self.software == 'gaussian':
# Gaussian's memory is in MB, total for all cpus
self.memory = memory * 1000
elif self.software == 'orca':
# Orca's memory is per cpu and in MB
self.memory = memory * 1000 / self.cpus
elif self.software == 'qchem':
pass # QChem manages its memory automatically, for now ARC will not intervene
# see http://www.q-chem.com/qchem-website/manual/qchem44_manual/CCparallel.html

self.fine = fine
self.shift = shift
Expand Down Expand Up @@ -452,18 +472,17 @@ def write_submit_script(self):
else:
raise JobError('Could not determine format for maximal job time.\n Format is determined by {0}, but '
'got {1} for {2}'.format(t_max_format, servers[self.server]['cluster_soft'], self.server))
cpus = servers[self.server]['cpus'] if 'cpus' in servers[self.server] else 8
architecture = ''
if self.server.lower() == 'pharos':
# here we're hard-coding ARC for Pharos, a Green Group server
# If your server has different node architectures, implement something similar
if cpus <= 8:
if self.cpus <= 8:
architecture = '\n#$ -l harpertown'
else:
architecture = '\n#$ -l magnycours'
try:
self.submit = submit_scripts[self.server][self.software.lower()].format(
name=self.job_server_name, un=un, t_max=t_max, mem_cpu=int(self.memory / cpus), cpus=cpus,
name=self.job_server_name, un=un, t_max=t_max, mem_per_cpu=int(self.mem_per_cpu), cpus=self.cpus,
architecture=architecture)
except KeyError:
logging.error('Could not find submit script for server {0}, make sure your submit scripts '
Expand Down Expand Up @@ -719,10 +738,9 @@ def write_input_file(self):
raise
else:
try:
cpus = servers[self.server]['cpus'] if 'cpus' in servers[self.server] else 8
self.input = self.input.format(memory=self.memory, method=self.method, slash=slash, bath=self.bath_gas,
basis=self.basis_set, charge=self.charge, multiplicity=self.multiplicity,
spin=self.spin, xyz=self.xyz, job_type_1=job_type_1, cpus=cpus,
spin=self.spin, xyz=self.xyz, job_type_1=job_type_1, cpus=self.cpus,
job_type_2=job_type_2, scan=scan_string, restricted=restricted,
fine=fine, shift=self.shift, trsh=self.trsh, scan_trsh=self.scan_trsh,)
except KeyError:
Expand Down
2 changes: 1 addition & 1 deletion arc/job/jobTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def test_automatic_ess_assignment(self):
project_directory=os.path.join(arc_path, 'Projects', 'project_test'), fine=True, job_num=100)
self.assertEqual(job0.software, 'qchem')

self.assertEqual(job0.memory, 15000)
self.assertEqual(job0.memory_gb, 15)
self.assertEqual(job0.max_job_time, 120)

def test_bath_gas(self):
Expand Down
13 changes: 6 additions & 7 deletions arc/job/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu {mem_cpu}
#SBATCH --mem-per-cpu {mem_per_cpu}
module add c3ddb/gaussian/09.d01
which g09
Expand Down Expand Up @@ -62,7 +62,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu {mem_cpu}
#SBATCH --mem-per-cpu {mem_per_cpu}
module add c3ddb/orca/4.1.2
module add c3ddb/openmpi/3.1.3
Expand Down Expand Up @@ -106,7 +106,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu={mem_cpu}
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH -x node07, node05
which 16
Expand Down Expand Up @@ -148,7 +148,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu={mem_cpu}
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH -x node07, node05
export PATH=/opt/molpro/molprop_2015_1_linux_x86_64_i8/bin:$PATH
Expand Down Expand Up @@ -187,7 +187,6 @@
#$ -l long{architecture}
#$ -l h_rt={t_max}
#$ -pe singlenode {cpus}
#$ -l h=!node60.cluster
#$ -cwd
#$ -o out.txt
#$ -e err.txt
Expand Down Expand Up @@ -254,7 +253,7 @@
mkdir -p /scratch/{un}/{name}/qlscratch
qchem -nt 6 input.in output.out
qchem -nt {cpus} input.in output.out
rm -r /scratch/{un}/{name}
Expand All @@ -276,7 +275,7 @@
sdir=/scratch/{un}
mkdir -p /scratch/{un}/qlscratch
molpro -d $sdir -n 6 input.in
molpro -d $sdir -n {cpus} input.in
""",
# oneDMin
'onedmin': """#! /bin/bash -l
Expand Down
4 changes: 2 additions & 2 deletions arc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class ARC(object):
`rmgdb` ``RMGDatabase`` The RMG database object
`allow_nonisomorphic_2d` ``bool`` Whether to optimize species even if they do not have a 3D conformer that is
isomorphic to the 2D graph representation
`memory` ``int`` The allocated job memory in MB (1500 MB by default)
`memory` ``int`` The total allocated job memory in GB (15 by default)
`job_types` ``dict`` A dictionary of job types to execute. Keys are job types, values are boolean
`bath_gas` ``str`` A bath gas. Currently used in OneDMin to calc L-J parameters.
Allowed values are He, Ne, Ar, Kr, H2, N2, O2
Expand All @@ -95,7 +95,7 @@ def __init__(self, input_dict=None, project=None, arc_species_list=None, arc_rxn
conformer_level='', composite_method='', opt_level='', freq_level='', sp_level='', scan_level='',
ts_guess_level='', use_bac=True, job_types=None, model_chemistry='', initial_trsh=None, t_min=None,
t_max=None, t_count=None, verbose=logging.INFO, project_directory=None, max_job_time=120,
allow_nonisomorphic_2d=False, job_memory=15000, ess_settings=None, bath_gas=None,
allow_nonisomorphic_2d=False, job_memory=15, ess_settings=None, bath_gas=None,
adaptive_levels=None):
self.__version__ = '1.0.0'
self.verbose = verbose
Expand Down
10 changes: 5 additions & 5 deletions arc/mainTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_as_dict(self):
'reactions': [],
'scan_level': '',
'sp_level': 'ccsd(t)-f12/cc-pvtz-f12',
'job_memory': 15000,
'job_memory': 15,
'job_types': {u'1d_rotors': False,
'conformers': True,
'fine': False,
Expand Down Expand Up @@ -162,7 +162,7 @@ def test_check_project_name(self):
def test_restart(self):
"""
Test restarting ARC through the ARC class in main.py via the input_dict argument of the API
Rather than through ARC.py. Check that all files are in place and tst file content.
Rather than through ARC.py. Check that all files are in place and the log file content.
"""
restart_path = os.path.join(arc_path, 'arc', 'testing', 'restart(H,H2O2,N2H3,CH3CO2).yml')
project = 'arc_project_for_testing_delete_after_usage2'
Expand Down Expand Up @@ -266,9 +266,9 @@ def test_restart(self):
spc2 = Species().fromSMILES(str('CC([O])=O'))
spc2.generate_resonance_structures()
spc2.thermo = db.thermo.getThermoData(spc2)
self.assertAlmostEqual(spc2.getEnthalpy(298), -176074.01886272896, 1)
self.assertAlmostEqual(spc2.getEntropy(298), 283.2225158405262, 1)
self.assertAlmostEqual(spc2.getHeatCapacity(1000), 118.28356605714401, 1)
self.assertAlmostEqual(spc2.getEnthalpy(298), -179231.05071240617, 1)
self.assertAlmostEqual(spc2.getEntropy(298), 283.50278467781203, 1)
self.assertAlmostEqual(spc2.getHeatCapacity(1000), 118.81862727376, 1)
self.assertTrue('arc_project_for_testing_delete_after_usage2' in spc2.thermo.comment)

# delete the generated library from RMG-database
Expand Down
28 changes: 14 additions & 14 deletions arc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
from arc.species.species import ARCSpecies, TSGuess, determine_rotor_symmetry
from arc.species.converter import get_xyz_string, molecules_from_xyz, check_isomorphism
from arc.ts.atst import autotst
from arc.settings import rotor_scan_resolution, inconsistency_ab, inconsistency_az, maximum_barrier, default_job_types
from arc.settings import rotor_scan_resolution, inconsistency_ab, inconsistency_az, maximum_barrier, default_job_types,\
servers

##################################################################

Expand Down Expand Up @@ -82,7 +83,7 @@ class Scheduler(object):
isomorphic to the 2D graph representation
`dont_gen_confs` ``list`` A list of species labels for which conformer jobs were loaded from a restart file,
and additional conformer generation should be avoided
`memory` ``int`` The allocated job memory (1500 MB by default)
`memory` ``int`` The total allocated job memory in GB (15 by default)
`job_types` ``dict`` A dictionary of job types to execute. Keys are job types, values are boolean
`bath_gas` ``str`` A bath gas. Currently used in OneDMin to calc L-J parameters.
Allowed values are He, Ne, Ar, Kr, H2, N2, O2
Expand Down Expand Up @@ -119,7 +120,7 @@ class Scheduler(object):
def __init__(self, project, ess_settings, species_list, composite_method, conformer_level, opt_level, freq_level,
sp_level, scan_level, ts_guess_level, orbitals_level, adaptive_levels, project_directory, rmgdatabase,
job_types=None, initial_trsh=None, rxn_list=None, restart_dict=None, max_job_time=120,
allow_nonisomorphic_2d=False, memory=15000, testing=False, bath_gas=None):
allow_nonisomorphic_2d=False, memory=15, testing=False, bath_gas=None):
self.rmgdb = rmgdatabase
self.restart_dict = restart_dict
self.species_list = species_list
Expand Down Expand Up @@ -1567,9 +1568,10 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
conformer=conformer)
elif 'memory' not in job.ess_trsh_methods:
# Increase memory allocation
memory = job.memory * 2
logging.info('Troubleshooting {type} job in {software} using memory: {mem} MB instead of {old} MB'.
format(type=job_type, software=job.software, mem=memory, old=job.memory))
max_mem = servers[job.server].get('memory', 128) # Node memory in GB, default to 128 if not specified
memory = job.memory_gb * 2 if job.memory_gb * 2 < max_mem * 0.9 else max_mem * 0.9
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB instead of {old} GB'.
format(type=job_type, software=job.software, mem=memory, old=job.memory_gb))
job.ess_trsh_methods.append('memory')
self.run_job(label=label, xyz=xyz, level_of_theory=level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, memory=memory, ess_trsh_methods=job.ess_trsh_methods,
Expand Down Expand Up @@ -1678,11 +1680,9 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
add_mem = float(job.job_status[1].split()[-1]) # parse Molpro's requirement
add_mem = int(math.ceil(add_mem / 100.0)) * 100 # round up to the next hundred
add_mem += 250 # be conservative
memory = job.memory + add_mem
if memory < 5000:
memory = 5000
logging.info('Troubleshooting {type} job in {software} using memory: {mw} MW'.format(
type=job_type, software=job.software, mw=memory))
memory = job.memory_gb + add_mem / 128. # convert MW to GB
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB'.format(
type=job_type, software=job.software, mem=memory))
self.run_job(label=label, xyz=xyz, level_of_theory=job.level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, shift=job.shift, memory=memory,
ess_trsh_methods=job.ess_trsh_methods, conformer=conformer)
Expand Down Expand Up @@ -1717,9 +1717,9 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
elif 'memory' not in job.ess_trsh_methods:
# Increase memory allocation, also run with a shift
job.ess_trsh_methods.append('memory')
memory = 5000
logging.info('Troubleshooting {type} job in {software} using memory: {mw} MW'.format(
type=job_type, software=job.software, mw=memory))
memory = servers[job.server]['memory'] # set memory to the value of an entire node (in GB)
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB'.format(
type=job_type, software=job.software, mem=memory))
shift = 'shift,-1.0,-0.5;'
self.run_job(label=label, xyz=xyz, level_of_theory=job.level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, shift=shift, memory=memory,
Expand Down
3 changes: 2 additions & 1 deletion arc/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
'address': 'server2.host.edu',
'un': '<username>',
'key': 'path_to_rsa_key',
'cpus': 48, # optional (default: 8)
'cpus': 48, # number of cpu's per node, optional (default: 8)
'memory': 128, # amount of memory per node in GB, optional (default: 16)
},
'local': {
'cluster_soft': 'OGE',
Expand Down

0 comments on commit 515462f

Please sign in to comment.