Skip to content

Commit

Permalink
Merge pull request #137 from ReactionMechanismGenerator/memory
Browse files Browse the repository at this point in the history
Organized job memory handling
  • Loading branch information
alongd committed Jun 17, 2019
2 parents faed1a0 + 0a956da commit 515462f
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 41 deletions.
2 changes: 1 addition & 1 deletion arc/job/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
input_files = {
'gaussian': """%chk=check.chk
%mem={memory}mb
%nproc={cpus}
%NProcShared={cpus}
#P {job_type_1} {restricted}{method}{slash}{basis} {job_type_2} {fine} {trsh} iop(2/9=2000)
Expand Down
38 changes: 28 additions & 10 deletions arc/job/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class Job(object):
`scan_res` ``int`` The rotor scan resolution in degrees
`software` ``str`` The electronic structure software to be used
`server_nodes` ``list`` A list of nodes this job was submitted to (for troubleshooting)
`memory` ``int`` The allocated memory (1500 MB by default)
`memory` ``int`` The total job allocated memory in GB (15 by default)
`method` ``str`` The calculation method (e.g., 'B3LYP', 'CCSD(T)', 'CBS-QB3'...)
`basis_set` ``str`` The basis set (e.g., '6-311++G(d,p)', 'aug-cc-pVTZ'...)
`fine` ``bool`` Whether to use fine geometry optimization parameters
Expand Down Expand Up @@ -97,7 +97,7 @@ class Job(object):
"""
def __init__(self, project, ess_settings, species_name, xyz, job_type, level_of_theory, multiplicity,
project_directory, charge=0, conformer=-1, fine=False, shift='', software=None, is_ts=False, scan='',
pivots=None, memory=15000, comments='', trsh='', scan_trsh='', ess_trsh_methods=None, bath_gas=None,
pivots=None, memory=15, comments='', trsh='', scan_trsh='', ess_trsh_methods=None, bath_gas=None,
initial_trsh=None, job_num=None, job_server_name=None, job_name=None, job_id=None, server=None,
initial_time=None, occ=None, max_job_time=120, scan_res=None, checkfile=None, number_of_radicals=None,
testing=False):
Expand Down Expand Up @@ -295,10 +295,30 @@ def __init__(self, project, ess_settings, species_name, xyz, job_type, level_of_

self.server = server if server is not None else self.ess_settings[self.software][0]

self.cpus = servers[self.server].get('cpus', 8) # set to 8 by default
self.mem_per_cpu = memory * 1000 / self.cpus # The `#SBATCH --mem-per-cpu` directive is in MB
max_mem = servers[self.server].get('memory', None) # max memory per node
if max_mem is not None and memory > max_mem * 0.9:
logging.warning('The memory for job {0} using {1} ({2} GB) exceeds 90% of the the maximum node memory on '
'{3}. Setting it to 90% * {4} GB.'.format(self.job_name, self.software,
memory, self.server, max_mem))
memory = 0.9 * max_mem
self.memory_gb = memory # store the memory in GB for troubleshooting (when re-running the job)
if self.software == 'molpro':
# molpro's memory is in MW, 1500 MW should be enough as an initial general memory requirement assessment
memory /= 10
self.memory = memory
# Molpro's memory is per cpu and in MW (mega word; 1 MW ~= 8 MB; 1 GB = 128 MW)
self.memory = memory * 128 / self.cpus
if self.software == 'terachem':
# TeraChem's memory is in MW (mega word; 1 MW ~= 8 MB; 1 GB = 128 MW)
self.memory = memory * 128
elif self.software == 'gaussian':
# Gaussian's memory is in MB, total for all cpus
self.memory = memory * 1000
elif self.software == 'orca':
# Orca's memory is per cpu and in MB
self.memory = memory * 1000 / self.cpus
elif self.software == 'qchem':
pass # QChem manages its memory automatically, for now ARC will not intervene
# see http://www.q-chem.com/qchem-website/manual/qchem44_manual/CCparallel.html

self.fine = fine
self.shift = shift
Expand Down Expand Up @@ -452,18 +472,17 @@ def write_submit_script(self):
else:
raise JobError('Could not determine format for maximal job time.\n Format is determined by {0}, but '
'got {1} for {2}'.format(t_max_format, servers[self.server]['cluster_soft'], self.server))
cpus = servers[self.server]['cpus'] if 'cpus' in servers[self.server] else 8
architecture = ''
if self.server.lower() == 'pharos':
# here we're hard-coding ARC for Pharos, a Green Group server
# If your server has different node architectures, implement something similar
if cpus <= 8:
if self.cpus <= 8:
architecture = '\n#$ -l harpertown'
else:
architecture = '\n#$ -l magnycours'
try:
self.submit = submit_scripts[self.server][self.software.lower()].format(
name=self.job_server_name, un=un, t_max=t_max, mem_cpu=int(self.memory / cpus), cpus=cpus,
name=self.job_server_name, un=un, t_max=t_max, mem_per_cpu=int(self.mem_per_cpu), cpus=self.cpus,
architecture=architecture)
except KeyError:
logging.error('Could not find submit script for server {0}, make sure your submit scripts '
Expand Down Expand Up @@ -719,10 +738,9 @@ def write_input_file(self):
raise
else:
try:
cpus = servers[self.server]['cpus'] if 'cpus' in servers[self.server] else 8
self.input = self.input.format(memory=self.memory, method=self.method, slash=slash, bath=self.bath_gas,
basis=self.basis_set, charge=self.charge, multiplicity=self.multiplicity,
spin=self.spin, xyz=self.xyz, job_type_1=job_type_1, cpus=cpus,
spin=self.spin, xyz=self.xyz, job_type_1=job_type_1, cpus=self.cpus,
job_type_2=job_type_2, scan=scan_string, restricted=restricted,
fine=fine, shift=self.shift, trsh=self.trsh, scan_trsh=self.scan_trsh,)
except KeyError:
Expand Down
2 changes: 1 addition & 1 deletion arc/job/jobTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def test_automatic_ess_assignment(self):
project_directory=os.path.join(arc_path, 'Projects', 'project_test'), fine=True, job_num=100)
self.assertEqual(job0.software, 'qchem')

self.assertEqual(job0.memory, 15000)
self.assertEqual(job0.memory_gb, 15)
self.assertEqual(job0.max_job_time, 120)

def test_bath_gas(self):
Expand Down
13 changes: 6 additions & 7 deletions arc/job/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu {mem_cpu}
#SBATCH --mem-per-cpu {mem_per_cpu}
module add c3ddb/gaussian/09.d01
which g09
Expand Down Expand Up @@ -62,7 +62,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu {mem_cpu}
#SBATCH --mem-per-cpu {mem_per_cpu}
module add c3ddb/orca/4.1.2
module add c3ddb/openmpi/3.1.3
Expand Down Expand Up @@ -106,7 +106,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu={mem_cpu}
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH -x node07, node05
which 16
Expand Down Expand Up @@ -148,7 +148,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu={mem_cpu}
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH -x node07, node05
export PATH=/opt/molpro/molprop_2015_1_linux_x86_64_i8/bin:$PATH
Expand Down Expand Up @@ -187,7 +187,6 @@
#$ -l long{architecture}
#$ -l h_rt={t_max}
#$ -pe singlenode {cpus}
#$ -l h=!node60.cluster
#$ -cwd
#$ -o out.txt
#$ -e err.txt
Expand Down Expand Up @@ -254,7 +253,7 @@
mkdir -p /scratch/{un}/{name}/qlscratch
qchem -nt 6 input.in output.out
qchem -nt {cpus} input.in output.out
rm -r /scratch/{un}/{name}
Expand All @@ -276,7 +275,7 @@
sdir=/scratch/{un}
mkdir -p /scratch/{un}/qlscratch
molpro -d $sdir -n 6 input.in
molpro -d $sdir -n {cpus} input.in
""",
# oneDMin
'onedmin': """#! /bin/bash -l
Expand Down
4 changes: 2 additions & 2 deletions arc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class ARC(object):
`rmgdb` ``RMGDatabase`` The RMG database object
`allow_nonisomorphic_2d` ``bool`` Whether to optimize species even if they do not have a 3D conformer that is
isomorphic to the 2D graph representation
`memory` ``int`` The allocated job memory in MB (1500 MB by default)
`memory` ``int`` The total allocated job memory in GB (15 by default)
`job_types` ``dict`` A dictionary of job types to execute. Keys are job types, values are boolean
`bath_gas` ``str`` A bath gas. Currently used in OneDMin to calc L-J parameters.
Allowed values are He, Ne, Ar, Kr, H2, N2, O2
Expand All @@ -95,7 +95,7 @@ def __init__(self, input_dict=None, project=None, arc_species_list=None, arc_rxn
conformer_level='', composite_method='', opt_level='', freq_level='', sp_level='', scan_level='',
ts_guess_level='', use_bac=True, job_types=None, model_chemistry='', initial_trsh=None, t_min=None,
t_max=None, t_count=None, verbose=logging.INFO, project_directory=None, max_job_time=120,
allow_nonisomorphic_2d=False, job_memory=15000, ess_settings=None, bath_gas=None,
allow_nonisomorphic_2d=False, job_memory=15, ess_settings=None, bath_gas=None,
adaptive_levels=None):
self.__version__ = '1.0.0'
self.verbose = verbose
Expand Down
10 changes: 5 additions & 5 deletions arc/mainTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_as_dict(self):
'reactions': [],
'scan_level': '',
'sp_level': 'ccsd(t)-f12/cc-pvtz-f12',
'job_memory': 15000,
'job_memory': 15,
'job_types': {u'1d_rotors': False,
'conformers': True,
'fine': False,
Expand Down Expand Up @@ -162,7 +162,7 @@ def test_check_project_name(self):
def test_restart(self):
"""
Test restarting ARC through the ARC class in main.py via the input_dict argument of the API
Rather than through ARC.py. Check that all files are in place and tst file content.
Rather than through ARC.py. Check that all files are in place and the log file content.
"""
restart_path = os.path.join(arc_path, 'arc', 'testing', 'restart(H,H2O2,N2H3,CH3CO2).yml')
project = 'arc_project_for_testing_delete_after_usage2'
Expand Down Expand Up @@ -266,9 +266,9 @@ def test_restart(self):
spc2 = Species().fromSMILES(str('CC([O])=O'))
spc2.generate_resonance_structures()
spc2.thermo = db.thermo.getThermoData(spc2)
self.assertAlmostEqual(spc2.getEnthalpy(298), -176074.01886272896, 1)
self.assertAlmostEqual(spc2.getEntropy(298), 283.2225158405262, 1)
self.assertAlmostEqual(spc2.getHeatCapacity(1000), 118.28356605714401, 1)
self.assertAlmostEqual(spc2.getEnthalpy(298), -179231.05071240617, 1)
self.assertAlmostEqual(spc2.getEntropy(298), 283.50278467781203, 1)
self.assertAlmostEqual(spc2.getHeatCapacity(1000), 118.81862727376, 1)
self.assertTrue('arc_project_for_testing_delete_after_usage2' in spc2.thermo.comment)

# delete the generated library from RMG-database
Expand Down
28 changes: 14 additions & 14 deletions arc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
from arc.species.species import ARCSpecies, TSGuess, determine_rotor_symmetry
from arc.species.converter import get_xyz_string, molecules_from_xyz, check_isomorphism
from arc.ts.atst import autotst
from arc.settings import rotor_scan_resolution, inconsistency_ab, inconsistency_az, maximum_barrier, default_job_types
from arc.settings import rotor_scan_resolution, inconsistency_ab, inconsistency_az, maximum_barrier, default_job_types,\
servers

##################################################################

Expand Down Expand Up @@ -82,7 +83,7 @@ class Scheduler(object):
isomorphic to the 2D graph representation
`dont_gen_confs` ``list`` A list of species labels for which conformer jobs were loaded from a restart file,
and additional conformer generation should be avoided
`memory` ``int`` The allocated job memory (1500 MB by default)
`memory` ``int`` The total allocated job memory in GB (15 by default)
`job_types` ``dict`` A dictionary of job types to execute. Keys are job types, values are boolean
`bath_gas` ``str`` A bath gas. Currently used in OneDMin to calc L-J parameters.
Allowed values are He, Ne, Ar, Kr, H2, N2, O2
Expand Down Expand Up @@ -119,7 +120,7 @@ class Scheduler(object):
def __init__(self, project, ess_settings, species_list, composite_method, conformer_level, opt_level, freq_level,
sp_level, scan_level, ts_guess_level, orbitals_level, adaptive_levels, project_directory, rmgdatabase,
job_types=None, initial_trsh=None, rxn_list=None, restart_dict=None, max_job_time=120,
allow_nonisomorphic_2d=False, memory=15000, testing=False, bath_gas=None):
allow_nonisomorphic_2d=False, memory=15, testing=False, bath_gas=None):
self.rmgdb = rmgdatabase
self.restart_dict = restart_dict
self.species_list = species_list
Expand Down Expand Up @@ -1567,9 +1568,10 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
conformer=conformer)
elif 'memory' not in job.ess_trsh_methods:
# Increase memory allocation
memory = job.memory * 2
logging.info('Troubleshooting {type} job in {software} using memory: {mem} MB instead of {old} MB'.
format(type=job_type, software=job.software, mem=memory, old=job.memory))
max_mem = servers[job.server].get('memory', 128) # Node memory in GB, default to 128 if not specified
memory = job.memory_gb * 2 if job.memory_gb * 2 < max_mem * 0.9 else max_mem * 0.9
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB instead of {old} GB'.
format(type=job_type, software=job.software, mem=memory, old=job.memory_gb))
job.ess_trsh_methods.append('memory')
self.run_job(label=label, xyz=xyz, level_of_theory=level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, memory=memory, ess_trsh_methods=job.ess_trsh_methods,
Expand Down Expand Up @@ -1678,11 +1680,9 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
add_mem = float(job.job_status[1].split()[-1]) # parse Molpro's requirement
add_mem = int(math.ceil(add_mem / 100.0)) * 100 # round up to the next hundred
add_mem += 250 # be conservative
memory = job.memory + add_mem
if memory < 5000:
memory = 5000
logging.info('Troubleshooting {type} job in {software} using memory: {mw} MW'.format(
type=job_type, software=job.software, mw=memory))
memory = job.memory_gb + add_mem / 128. # convert MW to GB
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB'.format(
type=job_type, software=job.software, mem=memory))
self.run_job(label=label, xyz=xyz, level_of_theory=job.level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, shift=job.shift, memory=memory,
ess_trsh_methods=job.ess_trsh_methods, conformer=conformer)
Expand Down Expand Up @@ -1717,9 +1717,9 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
elif 'memory' not in job.ess_trsh_methods:
# Increase memory allocation, also run with a shift
job.ess_trsh_methods.append('memory')
memory = 5000
logging.info('Troubleshooting {type} job in {software} using memory: {mw} MW'.format(
type=job_type, software=job.software, mw=memory))
memory = servers[job.server]['memory'] # set memory to the value of an entire node (in GB)
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB'.format(
type=job_type, software=job.software, mem=memory))
shift = 'shift,-1.0,-0.5;'
self.run_job(label=label, xyz=xyz, level_of_theory=job.level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, shift=shift, memory=memory,
Expand Down
3 changes: 2 additions & 1 deletion arc/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
'address': 'server2.host.edu',
'un': '<username>',
'key': 'path_to_rsa_key',
'cpus': 48, # optional (default: 8)
'cpus': 48, # number of cpu's per node, optional (default: 8)
'memory': 128, # amount of memory per node in GB, optional (default: 16)
},
'local': {
'cluster_soft': 'OGE',
Expand Down

0 comments on commit 515462f

Please sign in to comment.