Skip to content

Commit

Permalink
PP-250: Hook debug causes file descriptor leak that crashes PBS server
Browse files Browse the repository at this point in the history
  • Loading branch information
bayucan committed Jul 15, 2016
1 parent a7cf99b commit a691252
Show file tree
Hide file tree
Showing 2 changed files with 233 additions and 3 deletions.
96 changes: 93 additions & 3 deletions src/server/hook_func.c
Expand Up @@ -3581,10 +3581,32 @@ do_runjob_reject_actions(job *pjob, char *hook_name)
static void
write_hook_reject_debug_output_and_close(char *reject_msg)
{
char *hook_outfile;
FILE *fp_debug_out = NULL;

fp_debug_out = pbs_python_get_hook_debug_output_fp();

if ( fp_debug_out == NULL) {
/* prepare to open file if output file pointer not stored */
hook_outfile = pbs_python_get_hook_debug_output_file();
if ((hook_outfile != NULL) && (hook_outfile[0] != '\0')) {
/* need to open in append mode, as */
/* process_hooks() may have */
/* already written into this file. */
fp_debug_out = fopen(hook_outfile, "a");
if (fp_debug_out == NULL) {
snprintf(log_buffer, sizeof(log_buffer),
"warning: open of hook debug output file %s failed!",
hook_outfile);
log_err(-1,
"write_hook_reject_output_and_close",
log_buffer);
} else {
pbs_python_set_hook_debug_output_fp(fp_debug_out);
}
}
}

if (fp_debug_out != NULL) {
fprintf(fp_debug_out, "%s=True\n",
EVENT_REJECT_OBJECT);
Expand Down Expand Up @@ -3612,10 +3634,32 @@ write_hook_reject_debug_output_and_close(char *reject_msg)
static void
write_hook_accept_debug_output_and_close(void)
{
char *hook_outfile;
FILE *fp_debug_out = NULL;

fp_debug_out = pbs_python_get_hook_debug_output_fp();

if ( fp_debug_out == NULL) {
/* prepare to open file if output file pointer not stored */
hook_outfile = pbs_python_get_hook_debug_output_file();
if ((hook_outfile != NULL) && (hook_outfile[0] != '\0')) {
/* need to open in append mode, as */
/* process_hooks() may have */
/* already written into this file. */
fp_debug_out = fopen(hook_outfile, "a");
if (fp_debug_out == NULL) {
snprintf(log_buffer, sizeof(log_buffer),
"warning: open of hook debug output file %s failed!",
hook_outfile);
log_err(-1,
"write_hook_accept_output_and_close",
log_buffer);
} else {
pbs_python_set_hook_debug_output_fp(fp_debug_out);
}
}
}

if (fp_debug_out != NULL) {
fprintf(fp_debug_out, "%s=True\n",
EVENT_ACCEPT_OBJECT);
Expand Down Expand Up @@ -3677,6 +3721,7 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
FILE *fp_debug = NULL;
FILE *fp2_debug = NULL;
FILE *fp_debug_out = NULL;
FILE *fp_debug_out_save = NULL;
char hook_inputfile[MAXPATHLEN+1];
char hook_datafile[MAXPATHLEN+1];
char hook_outfile[MAXPATHLEN+1];
Expand Down Expand Up @@ -3741,6 +3786,10 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,

suffix_sz = strlen(HOOK_SCRIPT_SUFFIX);

/* initialize various hook_debug_* instance */
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_file("");

for (phook = (hook *)GET_NEXT(*head_ptr); phook; phook = phook_next) {

if (preq->rq_type == PBS_BATCH_QueueJob) {
Expand Down Expand Up @@ -3853,7 +3902,8 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
default:
do_recreate = 0;
}
if (do_recreate) {
if (do_recreate) {
fp_debug_out_save = pbs_python_get_hook_debug_output_fp();
pbs_python_set_hook_debug_output_fp(fp_debug);
/* recreate_request() appends */
/* pbs.event().job or */
Expand All @@ -3864,7 +3914,7 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
/* written into the file represented */
/* by 'fp_debug'. */
(void)recreate_request(temp_req);
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_fp(fp_debug_out_save);
}
free_br(temp_req);
} else {
Expand Down Expand Up @@ -3895,6 +3945,12 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
pbs_python_set_hook_debug_data_fp(NULL);
pbs_python_set_hook_debug_data_file("");
}
if (fp_debug_out != NULL) {
fclose(fp_debug_out);
fp_debug_out = NULL;
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_file("");
}
return (-1);
}

Expand All @@ -3920,6 +3976,12 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
pbs_python_set_hook_debug_data_fp(NULL);
pbs_python_set_hook_debug_data_file("");
}
if (fp_debug_out != NULL) {
fclose(fp_debug_out);
fp_debug_out = NULL;
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_file("");
}
return (-1);
}
set_alarm(phook->alarm, pyinter_func);
Expand Down Expand Up @@ -3994,6 +4056,13 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
pbs_python_set_hook_debug_data_fp(NULL);
pbs_python_set_hook_debug_data_file("");
}

if (fp_debug_out != NULL) {
fclose(fp_debug_out);
fp_debug_out = NULL;
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_file("");
}
return (-1);
}
}
Expand Down Expand Up @@ -4038,11 +4107,26 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
pbs_python_set_hook_debug_output_file(hook_outfile);
fp_debug_out = fopen(hook_outfile, "w");
if (fp_debug_out != NULL) {
fp_debug_out_save = pbs_python_get_hook_debug_output_fp();
if (fp_debug_out_save != NULL) {
fclose(fp_debug_out_save);
}
pbs_python_set_hook_debug_output_fp(fp_debug_out);
}
} else {
fp_debug_out_save = pbs_python_get_hook_debug_output_fp();
if (fp_debug_out_save != NULL) {
fclose(fp_debug_out_save);
}
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_file("");
/* NOTE: don't call */
/* pbs_python_set_hook_debug_output_file() as */
/* we still need a file to dump any remaining */
/* debug output in case all hooks end */
/* up accepting the current event with some */
/* hooks with debug=true and some that are */
/* debug=false */

}

if (fp2_debug != NULL) {
Expand All @@ -4069,6 +4153,12 @@ process_hooks(struct batch_request *preq, char *hook_msg, size_t msg_len,
log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
LOG_ERR, phook->hook_name,
"Internal server error encountered. Skipping hook.");
if (fp_debug_out != NULL) {
fclose(fp_debug_out);
fp_debug_out = NULL;
pbs_python_set_hook_debug_output_fp(NULL);
pbs_python_set_hook_debug_output_file("");
}
return (-1); /* should not happen */
case -2: /* unhandled exception */
pbs_python_event_reject(NULL);
Expand Down
140 changes: 140 additions & 0 deletions test/tests/pbs_hook_debug_nocrash.py
@@ -0,0 +1,140 @@
# coding: utf-8

# Copyright (C) 1994-2016 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# The PBS Pro software is licensed under the terms of the GNU Affero General
# Public License agreement ("AGPL"), except where a separate commercial license
# agreement for PBS Pro version 14 or later has been executed in writing with Altair.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and distribute
# them - whether embedded or bundled with other software - under a commercial
# license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

from ptl.utils.pbs_testsuite import *


class TestHookDebugNoCrash(PBSTestSuite):

    """
    This tests to make sure the following does not occur:
        Hook debug causes file descriptor leak that crashes PBS server
    PRE: Have 3 queuejob hooks, qjob1, qjob2, qjob3 with order=1, order=2,
         order=3 respectively. qjob1 and qjob2 have debug=True while
         qjob3 has debug=False. Try submitting 1000 jobs.
    POST: On a fixed PBS, this test case will run to completion.
          On a PBS containing the bug, the test could fail on a server crash,
          a failure in qsub with "Invalid credential", or even a qstat
          hang with ptl returning:
              corretja: /opt/pbs/bin/qstat -f 4833.corretja
              2016-07-08 12:56:52,799 INFO     TIMEDOUT
          and server_logs having the message "Too many open files".
          This is because a previous bug causes pbs_server to not close the
          debug output file descriptors opened by subsequent hook executions.
    NOTE: This is assuming on one's local system, we have the
          following limit:
              # ulimit -a
              ...
              open files                      (-n) 1024
    """

    # Class variables
    # Largest 'open files' limit under which the test is run; with a
    # higher (or unlimited) limit the test is skipped in setUp().
    open_files_limit_expected = 1024

    def setUp(self):
        """
        Query 'ulimit -n' on the server host and skip the test when the
        reported limit is "unlimited" or greater than
        open_files_limit_expected; otherwise run the standard
        PBSTestSuite setup.
        """
        ret = self.du.run_cmd(
            self.server.hostname, [
                'ulimit', '-n'], sudo=True, as_script=True, logerr=False)
        self.assertEqual(ret['rc'], 0)
        open_files_limit = ret['out'][0]
        if (open_files_limit == "unlimited") or (
                int(open_files_limit) > self.open_files_limit_expected):
            self.skipTest(
                "\n'This test requires 'open files' system limit to be <= %d (current value=%s)." %
                (self.open_files_limit_expected, open_files_limit))
        PBSTestSuite.setUp(self)

    def test_hook_debug_no_crash(self):
        """
        Create three queuejob hooks (two with debug enabled, one without),
        turn scheduling off, then submit 1000 jobs and expect each one to
        reach the 'Q' state.
        """
        # The hook body must stay at column 0: it is imported verbatim as
        # the hook's Python script.
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "hook %s executed" % (e.hook_name,))
"""
        # (hook name, debug, order) -- orders are distinct so that the
        # hooks are ordered qjob1, qjob2, qjob3 as the class docstring
        # describes (qjob3 previously reused order=2).
        hook_specs = (
            ("qjob1", 'True', 1),
            ("qjob2", 'True', 2),
            ("qjob3", 'False', 3),
        )
        for hook_name, debug, order in hook_specs:
            a = {
                'event': "queuejob",
                'enabled': 'True',
                'debug': debug,
                'order': order}
            rv = self.server.create_import_hook(
                hook_name,
                a,
                hook_body,
                overwrite=True)
            self.assertTrue(rv)

        # Keep jobs queued so that only the queuejob hooks are exercised.
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'},
                            expect=True)

        for _ in range(1000):
            j = Job(TEST_USER)
            a = {
                'Resource_List.select': '1:ncpus=1',
                'Resource_List.walltime': 3600}
            j.set_attributes(a)
            j.set_sleep_time("5")
            jid = self.server.submit(j)
            self.server.expect(JOB, {'job_state': 'Q'}, id=jid)

0 comments on commit a691252

Please sign in to comment.