Skip to content

Commit

Permalink
PBS Server Update (#640)
Browse files Browse the repository at this point in the history
This PR ensures that if the maximum number of server requests is reached,
the software sleeps for 5 minutes and then attempts to resubmit the job.
It also fixes the PBS memory setting.
Furthermore, if the user provides the memory as a string, such as '34G',
the software will accept it.
  • Loading branch information
kfir4444 committed Apr 17, 2023
2 parents c0caa90 + 010848e commit 38c4ba3
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 3 deletions.
4 changes: 2 additions & 2 deletions arc/job/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ def write_submit_script(self) -> None:
name=self.job_server_name,
un=servers[self.server]['un'],
t_max=self.format_max_job_time(time_format=t_max_format[servers[self.server]['cluster_soft']]),
memory=int(self.submit_script_memory),
memory=int(self.submit_script_memory) if isinstance(self.submit_script_memory, int) else self.submit_script_memory,
cpus=self.cpu_cores,
architecture=architecture,
max_task_num=self.workers,
Expand Down Expand Up @@ -764,7 +764,7 @@ def set_cpu_and_mem(self):
self.submit_script_memory = math.ceil(total_submit_script_memory) # in MB
if cluster_software in ['pbs']:
# In PBS, "#PBS -l select=1:ncpus=8:mem=12000000" specifies the memory for all cores to be 12 MB.
self.submit_script_memory = math.ceil(total_submit_script_memory) * 1E8 # in Bytes
self.submit_script_memory = math.ceil(total_submit_script_memory) * 1E6 # in Bytes
elif cluster_software in ['slurm']:
# In Slurm, "#SBATCH --mem-per-cpu=2000" specifies the memory **per cpu/thread** to be 2000 MB.
self.submit_script_memory = math.ceil(total_submit_script_memory / self.cpu_cores) # in MB
Expand Down
2 changes: 1 addition & 1 deletion arc/job/adapter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def test_set_cpu_and_mem(self):
self.job_4.server = 'server3'
self.job_4.cpu_cores = None
self.job_4.set_cpu_and_mem()
expected_memory = math.ceil(14 * 1024 * 1.1) * 1E8
expected_memory = math.ceil(14 * 1024 * 1.1) * 1E6
self.assertEqual(self.job_4.submit_script_memory, expected_memory)
self.job_4.server = 'local'

Expand Down
9 changes: 9 additions & 0 deletions arc/job/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,15 @@ def submit_job(path: str,
submit_filename=submit_filename,
recursion=True,
)
if cluster_soft.lower() == 'pbs' and any('qsub: would exceed' in err_line for err_line in stderr):
logger.warning(f'Max number of submitted jobs was reached, sleeping...')
time.sleep(5 * 60)
submit_job(path=path,
cluster_soft=cluster_soft,
submit_cmd=submit_cmd,
submit_filename=submit_filename,
recursion=True,
)
if not len(stdout) or recursion:
return None, None
if len(stderr) > 0 or len(stdout) == 0:
Expand Down

0 comments on commit 38c4ba3

Please sign in to comment.