Skip to content

Commit

Permalink
PP-56: svr_clean_job_history can cause server quasihangs and spurious…
Browse files Browse the repository at this point in the history
… failovers
  • Loading branch information
prakashcv13 committed Jan 16, 2018
1 parent e6c9439 commit d42ddec
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 10 deletions.
1 change: 1 addition & 0 deletions src/include/server.h
Expand Up @@ -250,6 +250,7 @@ enum failover_state {
* Server job history defines & globals
*/
#define SVR_CLEAN_JOBHIST_TM 120 /* after 2 minutes, reschedule the work task */
#define SVR_CLEAN_JOBHIST_SECS 5 /* never spend more than 5 seconds in one sweep to clean hist */
#define SVR_JOBHIST_DEFAULT 1209600 /* default time period to keep job history: 2 weeks */

/* function prototypes */
Expand Down
86 changes: 76 additions & 10 deletions src/server/svr_jobfunc.c
Expand Up @@ -93,6 +93,7 @@
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <netdb.h>
#include <signal.h>
#include <sys/types.h>
Expand Down Expand Up @@ -5236,16 +5237,22 @@ svr_clean_job_history(struct work_task *pwt)
job *nxpjob = (job *)0;
int walltime_used = 0;

/* set up another work task for next time period */
if (pwt && svr_history_enable) {
if (!set_task(WORK_Timed,
(time_now + SVR_CLEAN_JOBHIST_TM),
svr_clean_job_history, (void *)0)) {
log_err(errno,
"svr_clean_job_history",
"Unable to set task for clean job history");
}
}
/*
* Keep track of time spent purging jobs, interrupts purge if necessary.
* Timed task in nearby future set if purge took too long.
* Timed task in far future only set if this task completes.
* Autotunes time between job history purges:
* - raised if last purge was short
* - lowered if this purge needs to be interrupted
*/

time_t begin_time;
time_t end_time;
static time_t time_between_tasks = SVR_CLEAN_JOBHIST_TM;

begin_time = time(NULL);
/* Initialize end_time, in case we do not get into the while loop */
end_time = begin_time;

/*
* Traverse through the SERVER job list and find the history
Expand Down Expand Up @@ -5290,6 +5297,65 @@ svr_clean_job_history(struct work_task *pwt)
}
/* restore the saved next in pjob */
pjob = nxpjob;

/* check if we spent too long hogging the pbs_server process here */
end_time = time(NULL);
if ((end_time - begin_time) > SVR_CLEAN_JOBHIST_SECS) {
/* Apparently the interval between history purges is too long.
* reduce it using factor 0.7
*/
time_between_tasks = (floor((double)time_between_tasks * 0.7));

/* no use reducing to less than 4 * SVR_CLEAN_JOBHIST_SECS
* since we'll already schedule a continuation task here
*/
if (time_between_tasks < (4 * SVR_CLEAN_JOBHIST_SECS))
time_between_tasks = 4 * SVR_CLEAN_JOBHIST_SECS;

/* set up another work task in near future,
* but leave as much time as we spent in this routine for other work first
*/
if (!set_task(WORK_Timed,
(end_time + SVR_CLEAN_JOBHIST_SECS),
svr_clean_job_history, (void *)0)) {
log_err(errno,
"svr_clean_job_history",
"Unable to set task for clean job history");
/* on error to set task
* just continue purging the history
*/
} else
/* but if we managed to set a task in near future, return;
* that task will continue where we left off
*/
return;
}
} /* end of while loop through jobs */

/* We purged everything necessary in this task if we get here.
* set up another work task for next time period.
*/
if (pwt && svr_history_enable) {
if (!set_task(WORK_Timed,
(time_now + time_between_tasks),
svr_clean_job_history, (void *)0)) {
log_err(errno,
"svr_clean_job_history",
"Unable to set task for clean job history");
}
}

/* try to move the time between tasks up again
* but only if we spent less than 2/3rds of what we were allowed to spend
* Note the last purge of a chain of purges will often tend to undo part of the lowering
* in the earlier incomplete purges -- that's OK: 0.7*1.1 is still smaller than 1
*/

if ((time_between_tasks < SVR_CLEAN_JOBHIST_TM) &&
((end_time - begin_time) < floor((double)SVR_CLEAN_JOBHIST_SECS * 2/3))) {
time_between_tasks = ceil((double) time_between_tasks * 1.1);
if (time_between_tasks > SVR_CLEAN_JOBHIST_TM)
time_between_tasks = SVR_CLEAN_JOBHIST_TM;
}
}

Expand Down
105 changes: 105 additions & 0 deletions test/tests/performance/pbs_history_cleanup_quasihang.py
@@ -0,0 +1,105 @@
# coding: utf-8

# Copyright (C) 1994-2017 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# The PBS Pro software is licensed under the terms of the GNU Affero General
# Public License agreement ("AGPL"), except where a separate commercial license
# agreement for PBS Pro version 14 or later has been executed in writing with
# Altair.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software - under
# a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

from tests.performance import *


class HistoryCleanupQuasihang(TestPerformance):
"""
This test suite aims at testing the quasihang caused by a lot of jobs
in the history.
Without the fix, the server takes a lot of time to respond to a client.
With the fix, the amount of time is significantly reduced.
"""

def setUp(self):
TestPerformance.setUp(self)

a = {'job_history_enable': 'True', "job_history_duration": "10:00:00"}
self.server.manager(MGR_CMD_SET, SERVER, a)

a = {ATTR_rescavail + '.ncpus': 100}
self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)

@timeout(18000)
def test_time_for_stat_during_history_cleanup(self):
"""
This test case submits 50k very short jobs so that the job history
has a lot of jobs in it.
After submitting the jobs, the job_history_duration is reduced so
that the server could start purging the job history.
Another job is submitted and stat-ed. The test case then finds the
amount of time the server takes to respond.
The test case is not designed to pass/fail on builds with/without
the fix.
"""
test = ['echo test\n']
# Submit a lot of jobs.
for _ in range(0, 10):
for _ in range(0, 10):
for _ in range(0, 500):
j = Job(TEST_USER, attrs={ATTR_k: 'oe'})
j.create_script(body=test)
jid = self.server.submit(j)
time.sleep(1)
self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x',
offset=10, interval=2, max_attempts=100)

a = {"job_history_duration": "00:00:05"}
self.server.manager(MGR_CMD_SET, SERVER, a)

j = Job(TEST_USER)
j.set_sleep_time(10000)
jid = self.server.submit(j)
self.server.expect(JOB, {'job_state': 'R'}, id=jid)

now1 = 0
now2 = 0
i = 0

while now2 - now1 < 2 and i < 125:
time.sleep(1)
now1 = int(time.time())
self.server.expect(JOB, {'job_state': 'R'}, id=jid)
now2 = int(time.time())
i += 1

self.logger.info("qstat took %d seconds to return\n",
(now2 - now1))

0 comments on commit d42ddec

Please sign in to comment.