Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
11119 lines (9982 sloc) 277 KB
/*
* Copyright (C) 1994-2019 Altair Engineering, Inc.
* For more information, contact Altair at www.altair.com.
*
* This file is part of the PBS Professional ("PBS Pro") software.
*
* Open Source License Information:
*
* PBS Pro is free software. You can redistribute it and/or modify it under the
* terms of the GNU Affero General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any
* later version.
*
* PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE.
* See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Commercial License Information:
*
* For a copy of the commercial license terms and conditions,
* go to: (http://www.pbspro.com/UserArea/agreement.html)
* or contact the Altair Legal Department.
*
* Altair’s dual-license business model allows companies, individuals, and
* organizations to create proprietary derivative works of PBS Pro and
* distribute them - whether embedded or bundled with other software -
* under a commercial license agreement.
*
* Use of Altair’s trademarks, including but not limited to "PBS™",
* "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
* trademark licensing policies.
*
*/
/**
* @file mom_main.c
* @brief
* The entry point function for MOM.
*/
#include <pbs_config.h> /* the master config generated by configure */
#ifdef PYTHON
#include <Python.h>
#endif
#ifdef WIN32
#include <sys/timeb.h>
#else
#include <unistd.h>
#include <pwd.h>
#include <grp.h>
#include <netdb.h>
#include <sys/param.h>
#include <sys/times.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#ifdef _POSIX_MEMLOCK
#include <sys/mman.h>
#endif /* _POSIX_MEMLOCK */
#include <dirent.h>
#endif /* WIN32 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "libpbs.h"
#include "pbs_ifl.h"
#include "server_limits.h"
#include "list_link.h"
#include "attribute.h"
#include "placementsets.h"
#include "resource.h"
#include "job.h"
#include "mom_func.h"
#include "pbs_nodes.h"
#include "svrfunc.h"
#include "pbs_error.h"
#include "log.h"
#include "net_connect.h"
#include "rpp.h"
#include "dis.h"
#include "dis_init.h"
#include "resmon.h"
#include "batch_request.h"
#include "pbs_license.h"
#include "pbs_version.h"
#include "libsec.h"
#include "pbs_ecl.h"
#include "pbs_internal.h"
#if defined(MOM_CPUSET)
#include "mom_vnode.h"
#endif /* MOM_CPUSET */
#include "avltree.h"
#ifndef WIN32
#ifndef NAS /* localmod 113 */
#include "hwloc.h"
#endif /* localmod 113 */
#endif
#include "hook.h"
#include "mom_hook_func.h"
#include "work_task.h"
#include "pbs_share.h"
#include "mom_server.h"
#if MOM_CSA || MOM_ALPS
#include "mom_mach.h"
#endif /* MOM_CSA or MOM_ALPS */
#include "pbs_reliable.h"
#define STATE_UPDATE_TIME 10
#ifndef PRIO_MAX
#define PRIO_MAX 20
#endif
#ifndef PRIO_MIN
#define PRIO_MIN -20
#endif
/*
* Default "mutual exclusion" for job start/queue commit operations. The
* pointer is provided so multi-threaded mom implementations can replace it
* with a pointer to a shared mutex.
*/
#if IRIX6_CPUSET == 1
#include "pbs_mutex.h"
#include "cpusets.h"
static pbs_mutex pbs_commit_mtx;
volatile pbs_mutex *pbs_commit_ptr = &pbs_commit_mtx;
#endif
/* Global Data Items */
enum hup_action call_hup = HUP_CLEAR;
static int update_state_flag = 0;
double cputfactor = 1.00;
unsigned int default_server_port;
int exiting_tasks = 0;
float ideal_load_val = -1.0;
int idle_on_maxload = 0;
int internal_state = 0;
int internal_state_update = 0;
int termin_child = 0;
int do_debug_report = 0;
uid_t restrict_user_exempt_uids[NUM_RESTRICT_USER_EXEMPT_UIDS] = {0};
int svr_delay_entry = 0;
int mom_net_up = 0;
time_t mom_net_up_time = 0;
#ifdef WIN32
LASTINPUTINFO key_mouse_press = { sizeof(LASTINPUTINFO), 0 };
int nrun_factor = 0;
void WINAPI PbsMomMain(DWORD dwArgc, LPTSTR *rgszArgv);
void WINAPI PbsMomHandler(DWORD dwControl);
DWORD WINAPI main_thread(void *pv);
/*
* NOTE: Note the global state used by your service. Your service has a name,
* state and a status handle used by SetServiceStatus.
*/
const TCHAR * const g_PbsMomName = __TEXT("PBS_MOM");
HANDLE g_hthreadMain = 0;
SERVICE_STATUS_HANDLE g_ssHandle = 0;
DWORD g_dwCurrentState = SERVICE_START_PENDING;
HANDLE hStop = NULL;
#else
char *mom_domain;
#endif /* WIN32 */
extern void mom_vnlp_report(vnl_t *vnl, char *header);
int alien_attach = 0; /* attach alien procs */
int alien_kill = 0; /* kill alien procs */
#if defined(MOM_CPUSET) && (CPUSET_VERSION >= 4)
char *cpuset_error_action = "offline";
#endif /* MOM_CPUSET && CPUSET_VERSION >= 4 */
int lockfds;
float max_load_val = -1.0;
int max_poll_downtime_val = PBS_MAX_POLL_DOWNTIME;
char *mom_domain;
char *mom_home;
char mom_host[PBS_MAXHOSTNAME+1];
pid_t mom_pid;
int mom_run_state = 1;
char mom_short_name[PBS_MAXHOSTNAME+1];
int next_sample_time = MAX_CHECK_POLL_TIME;
int max_check_poll = MAX_CHECK_POLL_TIME;
int min_check_poll = MIN_CHECK_POLL_TIME;
int inc_check_poll = 20;
int num_acpus = 1;
int num_pcpus = 1;
int num_oscpus = 1;
u_Long av_phy_mem = 0; /* physical memory in KB */
int num_var_env;
char *path_epilog;
char *path_jobs;
char *path_prolog;
char *path_spool;
char *path_undeliv;
char *path_addconfigs;
char path_addconfigs_reserved_prefix[] = "PBS";
char *path_hooks;
char *path_hooks_workdir;
char *path_rescdef;
hook *phook;
char *hook_suffix = HOOK_FILE_SUFFIX;
int hook_suf_len;
char hook_msg[HOOK_MSG_SIZE+1];
int baselen;
char *psuffix;
struct dirent *pdirent;
DIR *dir;
/*char pbs_current_user[PBS_MAXUSER] = "pbs_mom";*/ /* for libpbs.a */
/* above is TLS data now, strcpy the value "pbs_mom" into it in main */
#ifdef WIN32
char pbs_tmpdir[MAX_PATH] = TMP_DIR;
char pbs_jobdir_root[MAX_PATH]= "";
#else
char pbs_tmpdir[_POSIX_PATH_MAX] = TMP_DIR;
char pbs_jobdir_root[_POSIX_PATH_MAX]= "";
#endif
vnl_t *vnlp = NULL; /* vnode list */
unsigned long hooks_rescdef_checksum = 0;
/* vnlp_from_hook: vnode list changes made by an exechost_startup hook that */
/* are sent to the server initially as part of the IS_HELLO/IS_CLUSTER_ADDR/ */
/* IS_CLUSTER_ADDR2 sequence. Once successfully sent to the server, */
/* entries matching HOOK_VNL_PERSISTENT_ATTRIBS will be merged with the */
/* main vnlp structure, which gets resent when the server loses contact */
/* with mom and the server sends an IS_HELLO request */
vnl_t *vnlp_from_hook = NULL;
extern char *msg_startup1;
extern char *msg_init_chdir;
extern char *msg_corelimit;
int pbs_errno;
gid_t pbsgroup;
unsigned int pbs_mom_port;
unsigned int pbs_rm_port;
pbs_list_head mom_polljobs; /* jobs that must have resource limits polled */
pbs_list_head mom_deadjobs; /* jobs that need to purged, see chk_del_job */
int server_stream = -1;
pbs_list_head svr_newjobs; /* jobs being sent to MOM */
pbs_list_head svr_alljobs; /* all jobs under MOM's control */
time_t time_last_sample = 0;
extern time_t time_now;
time_t time_resc_updated = 0;
extern pbs_list_head svr_requests;
extern struct var_table vtable; /* see start_exec.c */
#if MOM_ALPS
/*
 * Default ALPS tunables. The two release values initialise useconds_t
 * variables and are expressed in microseconds; the timeouts are in seconds.
 * The definitions deliberately carry no trailing ';' so that each macro
 * expands to a plain integer constant usable inside any expression.
 */
#define ALPS_REL_WAIT_TIME_DFLT 400000 /* 0.4 sec */
#define ALPS_REL_JITTER_DFLT 120000 /* 0.12 sec */
#define ALPS_REL_TIMEOUT 600 /* 10 min */
#define ALPS_CONF_EMPTY_TIMEOUT 10 /* 10 sec */
#define ALPS_CONF_SWITCH_TIMEOUT 35 /* 35 sec */
char *alps_client = NULL;
useconds_t alps_release_wait_time = ALPS_REL_WAIT_TIME_DFLT;
useconds_t alps_release_jitter = ALPS_REL_JITTER_DFLT;
int vnode_per_numa_node;
int alps_release_timeout;
int alps_confirm_empty_timeout;
int alps_confirm_switch_timeout;
#endif /* MOM_ALPS */
char *path_checkpoint = NULL;
static resource_def *rdcput;
static resource_def *rdwall;
int restart_background = FALSE;
int reject_root_scripts = FALSE;
int report_hook_checksums = TRUE;
int restart_transmogrify = FALSE;
int attach_allow = TRUE;
extern double wallfactor;
int suspend_signal;
int resume_signal;
int cycle_harvester = 0; /* MOM configured for cycle harvesting */
int restrict_user = 0; /* kill non PBS user procs */
int restrict_user_maxsys = 999; /* largest system user id */
int vnode_additive = 1;
momvmap_t **mommap_array = NULL;
int mommap_array_size = 0;
unsigned long QA_testing = 0;
long joinjob_alarm_time = -1;
long job_launch_delay = -1; /* # of seconds to delay job launch due to pipe reads (pipe read timeout) */
int update_joinjob_alarm_time = 0;
int update_job_launch_delay = 0;
#ifdef NAS /* localmod 015 */
unsigned long spoolsize = 0; /* default spoolsize = unlimited */
#endif /* localmod 015 */
#ifdef NAS /* localmod 153 */
static char quiesce_mom_flag_file[_POSIX_PATH_MAX] = "/PBS/flags/quiesce_mom";
int mom_should_quiesce = 0;
#endif /* localmod 153 */
#ifdef NAS_UNKILL /* localmod 011 */
#define KP_WAIT_TIME 60 /* number of seconds to wait for kill
to do its deed before declaring the
process unkillable */
struct kp {
pbs_list_link kp_link; /* linked list struct */
pid_t pid; /* pid of process being killed */
pid_t ppid; /* ppid of process being killed */
u_Long start_time; /* start_time of process being killed */
time_t kill_time; /* time() of first kill attempt */
};
typedef struct kp kp;
pbs_list_head killed_procs; /* procs killed by dorestrict_user() */
#endif /* localmod 011 */
unsigned long hook_action_id = 0;
pbs_list_head svr_allhooks;
/* hooks below ignored */
pbs_list_head svr_queuejob_hooks;
pbs_list_head svr_modifyjob_hooks;
pbs_list_head svr_resvsub_hooks;
pbs_list_head svr_movejob_hooks;
pbs_list_head svr_runjob_hooks;
pbs_list_head svr_periodic_hooks;
pbs_list_head svr_provision_hooks;
pbs_list_head svr_resv_end_hooks;
pbs_list_head svr_hook_job_actions;
pbs_list_head svr_hook_vnl_actions;
int svr_hook_resend_job_attrs = 0;
int mom_recvd_ip_cluster_addrs = 0;
/* the mom hooks */
pbs_list_head svr_execjob_begin_hooks;
pbs_list_head svr_execjob_prologue_hooks;
pbs_list_head svr_execjob_epilogue_hooks;
pbs_list_head svr_execjob_preterm_hooks;
pbs_list_head svr_execjob_launch_hooks;
pbs_list_head svr_execjob_end_hooks;
pbs_list_head svr_exechost_periodic_hooks;
pbs_list_head svr_exechost_startup_hooks;
pbs_list_head svr_execjob_attach_hooks;
pbs_list_head svr_execjob_resize_hooks;
pbs_list_head svr_execjob_abort_hooks;
pbs_list_head svr_execjob_postsuspend_hooks;
pbs_list_head svr_execjob_preresume_hooks;
/* the task lists */
pbs_list_head task_list_immed;
pbs_list_head task_list_timed;
pbs_list_head task_list_event;
#ifdef WIN32
/* copy request list */
pbs_list_head mom_copyreqs_list;
#endif
#ifndef WIN32
#ifdef RLIM64_INFINITY
struct rlimit64 orig_stack_size;
struct rlimit64 orig_nproc_limit;
struct rlimit64 orig_core_limit;
#else
struct rlimit orig_stack_size;
struct rlimit orig_nproc_limit;
struct rlimit orig_core_limit;
#endif /* RLIM64... */
#endif /* WIN32 */
/* Local Data Items */
static int nconfig; /* items in conf file */
static time_t idle_avail = 0; /* seconds for keyboard to be idle */
static time_t idle_busy = 10; /* seconds for keyboard to remain */
static int idle_check = -1; /* indicate if doing idle check */
static time_t idle_poll = 1; /* rate to poll keyboard when ! busy */
static time_t went_busy = 0; /* time keyboard went busy */
static time_t prior_key = 0; /* time of prior keystroke/mouse */
static int restrictrm = 0; /* restricted RM request */
static int kill_jobs_on_exit = 0; /* kill running jobs on Mom exit */
static char *path_checkpoint_from_getopt = NULL;
static char *path_checkpoint_from_getenv = NULL;
static char *path_checkpoint_default = NULL;
#ifdef WIN32
static char path_checkpoint_buf[MAX_PATH] = "\0";
#else
static char path_checkpoint_buf[_POSIX_PATH_MAX] = "\0";
#endif
static time_t maxtm; /* see getkbdtime() */
#ifdef WIN32
#define IDLE_POLL_BUFSIZE 512
/**
* This global variable stores the name of the PBS interactive service,
* which is used to control that service (i.e. start/stop/check).
*/
const TCHAR *const g_PbsInteractiveName = __TEXT("PBS_INTERACTIVE");
/**
* This global variable is used to store PBS_INTERACTIVE Service Control Handle
* which is used to control the pbs interactive service (i.e. start/stop/check)
*/
SC_HANDLE schPbsInteractive = INVALID_HANDLE_VALUE;
/**
* This global variable is used to indicate whether PBS_INTERACTIVE service
* has been registered with Service Control Manager
*
* 0 - Error
* 1 - PBS_INTERACTIVE service is registered
* -1 - PBS_INTERACTIVE service is not registered
*/
int interactive_svc_avail = 0;
#endif
/**
* To handle new configuration file formats (beginning with the vnode-
* specific data needed for GRUNT), we introduce the notion that any
* new-style mom configuration file must declare its version number at
* the beginning of the file, via a "$configversion" directive.
*
* At present, we handle only a single new version number, known internally
* as CONFIG_VNODEVERS.
*/
/*
 * Version numbers recognised for new-style configuration files.
 * All three enumerators currently share the value 2, so exactly one
 * version number is defined here.
 */
enum configvers {
CONFIG_MINVERS = 2,	/* presumably the lowest accepted version - confirm at use site */
CONFIG_VNODEVERS = 2,	/* the vnode-aware file format */
CONFIG_MAXVERS = 2	/* presumably the highest accepted version - confirm at use site */
};
/*
 * Node of a singly linked list whose payload is one struct config entry.
 */
struct config_list {
struct config c;		/* the configuration entry carried by this node */
struct config_list *c_link;	/* pointer to another list node (presumably the next one) */
};
static handler_ret_t config_versionhandler(char *, const char *, FILE *);
static handler_ret_t addclient(char *);
static handler_ret_t add_mom_action(char *);
static handler_ret_t config_verscheck(char *);
static handler_ret_t cputmult(char *);
#if defined(MOM_CPUSET) && (CPUSET_VERSION >= 4)
static handler_ret_t set_cpuset_error_action(char *);
#endif /* MOM_CPUSET && CPUSET_VERSION >= 4 */
static handler_ret_t parse_config(char *);
static handler_ret_t prologalarm(char *);
static handler_ret_t set_joinjob_alarm(char *);
static handler_ret_t set_job_launch_delay(char *);
static handler_ret_t restricted(char *);
static handler_ret_t set_alien_attach(char *);
static handler_ret_t set_alien_kill(char *);
#if MOM_ALPS
static handler_ret_t set_alps_client(char *);
static handler_ret_t set_alps_release_wait_time(char *);
static handler_ret_t set_alps_release_jitter(char *);
static handler_ret_t set_alps_release_timeout(char *);
static handler_ret_t set_alps_confirm_empty_timeout(char *);
static handler_ret_t set_vnode_per_numa_node(char *);
static handler_ret_t set_alps_confirm_switch_timeout(char *);
#endif /* MOM_ALPS */
static handler_ret_t set_attach_allow(char *);
static handler_ret_t set_checkpoint_path(char *);
static handler_ret_t set_enforcement(char *);
static handler_ret_t set_jobdir_root(char *);
static handler_ret_t set_kbd_idle(char *);
static handler_ret_t set_max_check_poll(char *);
static handler_ret_t set_min_check_poll(char *);
static handler_ret_t set_momname(char *);
static handler_ret_t set_momport(char *);
#ifdef WIN32
static handler_ret_t set_nrun_factor(char *);
#endif
static handler_ret_t set_restart_background(char *);
static handler_ret_t set_restart_transmogrify(char *);
static handler_ret_t set_restrict_user(char *);
static handler_ret_t set_restrict_user_maxsys(char *);
static handler_ret_t set_restrict_user_exceptions(char *);
static handler_ret_t set_suspend_signal(char *);
static handler_ret_t set_tmpdir(char *);
static handler_ret_t set_vnode_additive(char *);
static handler_ret_t setidealload(char *);
static handler_ret_t setlogevent(char *);
static handler_ret_t set_reject_root_scripts(char *);
static handler_ret_t set_report_hook_checksums(char *);
static handler_ret_t setmaxload(char *);
static handler_ret_t set_max_poll_downtime(char *);
#if MOM_BGL
static handler_ret_t set_bgl_reserve_partitions(char *);
#endif /* MOM_BGL */
static handler_ret_t usecp(char *);
static handler_ret_t wallmult(char *);
#ifdef NAS /* localmod 015 */
static handler_ret_t set_spoolsize(char *);
#endif /* localmod 015 */
#if defined(__sgi)
extern handler_ret_t set_checkpoint_upgrade(char *);
#endif /* __sgi */
/*
 * Table pairing a directive name with the handler function that processes
 * its value. Each handler receives a single string argument and returns a
 * handler_ret_t. The table ends at the entry whose name is NULL.
 * NOTE(review): the names look like the "$name" directives of the mom
 * configuration file (e.g. "configversion") - confirm against the parser.
 */
static struct specials {
char *name;
handler_ret_t (*handler)(char *);
} special[] = {
/* roughly alphabetized by name; several later additions are out of order */
{ "action", add_mom_action },
/*
****************************************************
** WARNING
** These "alien" entries are undocumented and are for
** prototype purposes only. DO NOT USE.
****************************************************
*/
{ "alien_attach", set_alien_attach },
{ "alien_kill", set_alien_kill },
#if MOM_ALPS
{ "alps_client", set_alps_client },
{ "alps_confirm_empty_timeout", set_alps_confirm_empty_timeout },
{ "alps_release_wait_time", set_alps_release_wait_time },
{ "alps_release_jitter", set_alps_release_jitter },
{ "alps_release_timeout", set_alps_release_timeout },
{ "vnode_per_numa_node", set_vnode_per_numa_node },
{ "alps_confirm_switch_timeout",set_alps_confirm_switch_timeout },
#endif /* MOM_ALPS */
{ "attach_allow", set_attach_allow },
#if MOM_BGL
{ "bgl_reserve_partitions", set_bgl_reserve_partitions },
#endif /* MOM_BGL */
{ "checkpoint_path", set_checkpoint_path },
#if defined(__sgi)
{ "checkpoint_upgrade", set_checkpoint_upgrade },
#endif /* __sgi */
{ "clienthost", addclient },
{ "configversion", config_verscheck },
{ "cputmult", cputmult },
#if defined(MOM_CPUSET) && (CPUSET_VERSION >= 4)
{ "cpuset_error_action", set_cpuset_error_action },
#endif /* MOM_CPUSET && CPUSET_VERSION >= 4 */
{ "enforce", set_enforcement },
{ "ideal_load", setidealload },
{ "jobdir_root", set_jobdir_root },
{ "kbd_idle", set_kbd_idle },
{ "logevent", setlogevent },
{ "max_check_poll", set_max_check_poll },
{ "max_load", setmaxload },
{ "max_poll_downtime", set_max_poll_downtime },
{ "min_check_poll", set_min_check_poll },
{ "momname", set_momname },
#ifdef WIN32
{ "nrun_factor", set_nrun_factor },
#endif
{ "port", set_momport },
{ "prologalarm", prologalarm },
{ "sister_join_job_alarm", set_joinjob_alarm },
{ "job_launch_delay", set_job_launch_delay },
{ "restart_background", set_restart_background },
{ "restart_transmogrify", set_restart_transmogrify },
{ "restrict_user", set_restrict_user },
{ "restrict_user_exceptions", set_restrict_user_exceptions },
{ "restrict_user_maxsysid", set_restrict_user_maxsys },
{ "restricted", restricted },
#ifdef NAS /* localmod 015 */
/*
* spool size limit
*/
{ "spool_size", set_spoolsize },
#endif /* localmod 015 */
{ "suspendsig", set_suspend_signal },
{ "tmpdir", set_tmpdir },
{ "vnodedef_additive", set_vnode_additive },
{ "usecp", usecp },
{ "wallmult", wallmult },
{ "reject_root_scripts", set_reject_root_scripts },
{ "report_hook_checksums", set_report_hook_checksums },
/* end-of-table marker */
{ NULL, NULL }
};
static struct specials addspecial[] = {
{ NULL, NULL }
};
char *log_file = NULL;
char *path_log;
char *ret_string;
int ret_size;
struct config *config_array = NULL;
struct config_list *config_list = NULL;
#ifndef WIN32
sigset_t allsigs;
#endif
int rm_errno;
unsigned int reqnum = 0; /* the packet number */
int port_care = 1; /* secure connecting ports */
uid_t uid = 0; /* uid we are running with */
int alarm_time = 10; /* time before alarm */
int nice_val = 0; /* nice daemon by this much */
char **maskclient = NULL; /* wildcard connections */
int mask_num = 0;
int mask_max = 0;
u_long localaddr = 0;
char extra_parm[] = "extra parameter(s)";
char no_parm[] = "required parameter not found";
int cphosts_num = 0;
struct cphosts *pcphosts = 0;
int enable_exechost2 = 0;
static int config_file_specified = 0;
#ifdef WIN32
static char config_file[MAX_PATH] = "config";
#else
static char config_file[_POSIX_PATH_MAX] = "config";
#endif
struct mom_action mom_action[(int)LastAction] = {
{ "terminate", 0, Default, NULL, NULL },
{ "checkpoint", 0, Default, NULL, NULL },
{ "checkpoint_abort", 0, Default, NULL, NULL },
{ "restart", 0, Default, NULL, NULL },
{ "multinodebusy", 0, Default, NULL, NULL }
};
/*
** These routines are in the "dependent" code.
*/
extern void dep_initialize(void);
extern void dep_cleanup(void);
/* External Functions */
extern void catch_child(int);
extern void init_abort_jobs(int);
extern void scan_for_exiting(void);
#ifdef NAS /* localmod 015 */
extern int to_size(char *, struct size_value *);
#endif /* localmod 015 */
#ifndef WIN32
extern void scan_for_terminated(void);
/* Local public functions */
void stop_me(int);
#endif
extern void cleanup_hooks_workdir(struct work_task *);
/* Local private functions */
static char *mk_dirs(char *);
static void check_busy(double);
/**
 * @brief
 *	Windows-only helper that verifies a path exists and names a
 *	directory, logging the reason when it does not.
 *
 * @param[in]	path - path to examine
 * @param[in]	id   - identifier passed through to the logging calls
 *
 * @return	int
 * @retval	0	path exists and is a directory
 * @retval	-1	stat() failed or path is not a directory
 *
 */
#ifdef WIN32
static int
check_directory(char *path, char *id)
{
	struct stat info;
	int rc = 0;

	if (stat(path, &info) == -1) {
		/* nothing could be stat'ed at this path */
		sprintf(log_buffer, "\"%s\" does not exist", path);
		log_err(-1, id, log_buffer);
		rc = -1;
	} else if (!S_ISDIR(info.st_mode)) {
		/* something is there, but it is not a directory */
		sprintf(log_buffer, "\"%s\" is not a directory", path);
		log_event(PBSEVENT_SYSTEM, 0, LOG_ERR, id, log_buffer);
		rc = -1;
	}
	return rc;
}
#endif
/**
 * @brief
 *	Placeholder resource function: it only logs that it was invoked,
 *	because it is not expected to be called. It is installed in the
 *	terminating entry of common_config.
 *
 * @param[in]	attrib - ignored
 *
 * @return	char *
 * @retval	NULL	always
 *
 */
char *
nullproc(struct rm_attribute *attrib)
{
	(void)attrib;	/* intentionally unused */

	log_err(-1, __func__, "should not be called");
	return NULL;
}
/* When non-NULL, arch() reports this string instead of the built-in PBS_MACH. */
char *pbs_mach = NULL;
/**
 * @brief
 *	Report the machine architecture string.
 *
 * @param[in]	attrib - must be NULL; any parameter is rejected
 *
 * @return	char *
 * @retval	pbs_mach	when it has been set
 * @retval	PBS_MACH	when pbs_mach is NULL
 * @retval	NULL		a parameter was supplied (rm_errno is set to
 *				RM_ERR_BADPARAM)
 *
 */
char *
arch(struct rm_attribute *attrib)
{
	if (attrib != NULL) {
		log_err(-1, __func__, extra_parm);
		rm_errno = RM_ERR_BADPARAM;
		return NULL;
	}

	return (pbs_mach != NULL) ? pbs_mach : PBS_MACH;
}
/**
 * @brief
 *	Build, once, a one-line description of the host system and return it.
 *
 *	On UNIX the text is "sysname nodename release version machine" taken
 *	from uname(2). On Windows it is "WIN2000 <hostname> <major>.<minor>
 *	<build> i386". The text is formatted into a function-local static
 *	buffer on the first successful call and that same pointer is returned
 *	on every later call, so this function neither depends on the global
 *	ret_string having been allocated nor allocates memory itself.
 *
 * @return	char *
 * @retval	system description	Success; the caller must not free or
 *					modify it
 * @retval	NULL			uname()/gethostname() failed; a later
 *					call will try again
 *
 */
char *
getuname(void)
{
	static char *name = NULL;
#ifndef WIN32
	/*
	 * Five NUL-terminated fields joined by four blanks plus the final
	 * NUL can never need more bytes than the structure that holds the
	 * fields, so sizing the buffer after the structure avoids truncation.
	 */
	static char name_buf[sizeof(struct utsname)];
	struct utsname n;

	if (name == NULL) {
		if (uname(&n) == -1)
			return NULL;
		snprintf(name_buf, sizeof(name_buf), "%s %s %s %s %s", n.sysname,
			n.nodename, n.release, n.version, n.machine);
		name = name_buf;
	}
#else
	/* fixed text and three 32-bit numbers need fewer than 64 bytes */
	static char name_buf[PBS_MAXHOSTNAME + 64];
	OSVERSIONINFO osvi;

	if (name == NULL) {
		char hostname[PBS_MAXHOSTNAME+1];

		if (gethostname(hostname, (sizeof(hostname) - 1)) != 0)
			return NULL;
		/* gethostname() may not terminate a truncated name */
		hostname[sizeof(hostname) - 1] = '\0';
		ZeroMemory(&osvi, sizeof(OSVERSIONINFO));
		osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
		GetVersionEx(&osvi);
		/* DWORD is unsigned long: print it as such */
		snprintf(name_buf, sizeof(name_buf), "WIN2000 %s %lu.%lu %lu i386",
			hostname,
			(unsigned long)osvi.dwMajorVersion,
			(unsigned long)osvi.dwMinorVersion,
			(unsigned long)osvi.dwBuildNumber);
		name = name_buf;
	}
#endif /* WIN32 */
	return name;
}
/**
 * @brief
 *	Resource-request wrapper around getuname(): rejects any parameter
 *	and otherwise hands back the system description string.
 *
 * @param[in]	attrib - must be NULL; any parameter is rejected
 *
 * @return	char *
 * @retval	result of getuname()	no parameter was given
 * @retval	NULL			a parameter was supplied (rm_errno is
 *					set to RM_ERR_BADPARAM)
 *
 */
static char *
requname(struct rm_attribute *attrib)
{
	if (attrib != NULL) {
		log_err(-1, __func__, extra_parm);
		rm_errno = RM_ERR_BADPARAM;
		return NULL;
	}

	return getuname();
}
/**
 * @brief
 *	Tell whether the user named in the request can be found with
 *	getpwnam().
 *
 * @param[in]	attrib - request attribute whose a_value is the user name
 *
 * @return	char *
 * @retval	"yes"	getpwnam() found the user
 * @retval	"no"	getpwnam() did not find the user
 * @retval	NULL	attrib or its value is missing (rm_errno is set to
 *			RM_ERR_NOPARAM)
 *
 */
static char *
validuser(struct rm_attribute *attrib)
{
	if (attrib == NULL || attrib->a_value == NULL) {
		log_err(-1, __func__, no_parm);
		rm_errno = RM_ERR_NOPARAM;
		return NULL;
	}

	return (getpwnam(attrib->a_value) != NULL) ? "yes" : "no";
}
/**
 * @brief
 *	Return the value reported by get_la() formatted with two decimal
 *	places.
 *
 *	The text lives in a function-local static buffer, so every call
 *	overwrites the previous result. Formatting is bounded by the buffer
 *	size; an implausibly large value is truncated rather than written
 *	past the end of the buffer.
 *
 * @param[in]	attrib - must be NULL; any parameter is rejected
 *
 * @return	char *
 * @retval	formatted value	Success
 * @retval	NULL		a parameter was supplied (rm_errno is set to
 *				RM_ERR_BADPARAM) or get_la() failed (rm_errno
 *				is set to RM_ERR_SYSTEM)
 *
 */
char *
loadave(struct rm_attribute *attrib)
{
	/* named differently from the file-scope ret_string to avoid shadowing it */
	static char la_string[20];
	double la;

	if (attrib) {
		log_err(-1, __func__, extra_parm);
		rm_errno = RM_ERR_BADPARAM;
		return NULL;
	}
	if (get_la(&la) != 0) {
		rm_errno = RM_ERR_SYSTEM;
		return NULL;
	}
	snprintf(la_string, sizeof(la_string), "%.2f", la);
	return la_string;
}
/**
 * @brief
 *	Produce a blank-separated list of every resource name found in the
 *	common, standard and dependent tables and, when present, in
 *	config_array.
 *
 *	The list is assembled in the shared log_buffer. Appending stops as
 *	soon as the next name would not fit, so the buffer can never be
 *	overrun; in that case the list returned is simply shorter and holds
 *	only complete names.
 *
 * @param[in]	attrib - must be NULL; any parameter is rejected
 *
 * @return	char *
 * @retval	log_buffer	Success; the contents are overwritten by
 *				whatever next writes to log_buffer
 * @retval	NULL		a parameter was supplied (rm_errno is set to
 *				RM_ERR_BADPARAM) or no name was collected
 *
 */
char *
reslist(struct rm_attribute *attrib)
{
	extern struct config common_config[];
	extern struct config standard_config[];
	extern struct config dependent_config[];
	struct config *tables[4];
	struct config *cp;
	size_t used = 0;	/* bytes of log_buffer filled, excluding the NUL */
	size_t t;
	int full = 0;

	if (attrib) {
		log_err(-1, __func__, extra_parm);
		rm_errno = RM_ERR_BADPARAM;
		return NULL;
	}

	tables[0] = common_config;
	tables[1] = standard_config;
	tables[2] = dependent_config;
	tables[3] = config_array;	/* NULL when nothing was configured */

	log_buffer[0] = '\0';
	for (t = 0; t < sizeof(tables) / sizeof(tables[0]) && !full; t++) {
		if (tables[t] == NULL)
			continue;
		for (cp = tables[t]; cp->c_name; cp++) {
			size_t nlen = strlen(cp->c_name);

			/* the name, one blank and the NUL must all fit */
			if (nlen + 1 >= sizeof(log_buffer) - used) {
				full = 1;
				break;
			}
			memcpy(log_buffer + used, cp->c_name, nlen);
			used += nlen;
			log_buffer[used++] = ' ';
			log_buffer[used] = '\0';
		}
	}

	if (used == 0)
		return NULL;

	/* every append ends with a blank; drop the final one */
	log_buffer[used - 1] = '\0';
	return log_buffer;
}
/*
 * Resource names served by functions defined in this file, each paired
 * with the function that produces its value. The table ends at the entry
 * whose name is NULL; that entry carries nullproc, which only logs that it
 * should not have been called.
 */
struct config common_config[] = {
{ "arch", { arch } },
{ "uname", { requname } },
{ "validuser", { validuser } },
{ "reslist", { reslist } },
{ NULL, { nullproc } }	/* end-of-table marker */
};
/**
 * @brief
 *	Look up a resource by name in a table of config entries that is
 *	terminated by an entry with a NULL c_name.
 *
 * @param[in]	where - first entry of the table to scan
 * @param[in]	what  - resource name to match exactly (strcmp)
 *
 * @return	struct config *
 * @retval	matching entry	Success
 * @retval	NULL		no entry matched, or either argument is NULL
 *
 */
struct config *
rm_search(struct config *where, char *what)
{
	struct config *entry;

	if (where == NULL || what == NULL)
		return NULL;

	for (entry = where; entry->c_name != NULL; entry++) {
		if (strcmp(entry->c_name, what) == 0)
			return entry;
	}

	/* reached the terminator without a match */
	return NULL;
}
/**
 * @brief
 *	Resolve a resource name against the common, standard and dependent
 *	tables, in that order, and invoke the function of the first match.
 *
 * @param[in]	res  - resource name to look up
 * @param[in]	attr - attribute handed unchanged to the matching function
 *
 * @return	char *
 * @retval	value returned by the matching function	a table knows res
 * @retval	NULL	no table knows res (rm_errno is set to RM_ERR_UNKNOWN)
 *
 */
char *
dependent(char *res, struct rm_attribute *attr)
{
	extern struct config standard_config[];
	extern struct config dependent_config[];
	struct config *tables[3];
	struct config *hit;
	size_t t;

	/* search order matters: the first table that knows the name wins */
	tables[0] = common_config;
	tables[1] = standard_config;
	tables[2] = dependent_config;

	for (t = 0; t < sizeof(tables) / sizeof(tables[0]); t++) {
		hit = rm_search(tables[t], res);
		if (hit != NULL)
			return (hit->c_u.c_func(attr));
	}

	rm_errno = RM_ERR_UNKNOWN;
	return NULL;
}
/**
 * @brief
 *	Release resources before the daemon goes away. All of the work is
 *	delegated to dep_cleanup() from the "dependent" code.
 *
 * @return	Void
 *
 */
void
cleanup(void)
{
	/* nothing is cleaned up here beyond what dep_cleanup() does */
	dep_cleanup();
}
/**
 * @brief
 *	Log why the daemon is stopping, run cleanup(), close the log and
 *	terminate (the thread on Windows, the process elsewhere) with
 *	status 1. This function does not return.
 *
 * @param[in]	sig - number of the signal being handled; a value of zero
 *		      or less is logged as an abnormal termination instead
 *
 * @return	Void
 *
 */
void
die(int sig)
{
	if (sig <= 0) {
		log_event(PBSEVENT_SYSTEM, 0, LOG_ALERT, __func__,
			"abnormal termination");
	} else {
		sprintf(log_buffer, "caught signal %d", sig);
		log_event(PBSEVENT_SYSTEM, 0, LOG_NOTICE, __func__, log_buffer);
	}

	cleanup();
	log_close(1);

#ifdef WIN32
	ExitThread(1);
#else
	exit(1);
#endif
}
/**
 * @brief
 *	Performs initialization steps like loading pbs.conf values,
 *	setting core limit size, running platform-specific initializations
 *	(e.g. cpusets initializations, topology data gathering),
 *	running the exechost_startup hook, and
 *	checking that there are no bad combinations of sharing values
 *	across the vnodes.
 *
 *	The steps, in order:
 *	  1. (UNIX with RLIMIT_CORE) load pbs.conf and apply the configured
 *	     core limit, remembering the previous one in orig_core_limit.
 *	  2. Reset the cpu counters and call dep_initialize().
 *	  3. Make sure vnlp_from_hook exists, set up resources and run the
 *	     exechost_startup hooks, recording offline/clear vnode actions.
 *	  4. Drop vnlp_from_hook again if the hooks left it empty.
 *	  5. If a vnode list (vnlp) exists, verify that the vnodes of one
 *	     host do not mix exclhost sharing with another sharing value;
 *	     such a mix is fatal (die(0)).
 *	  6. Give joinjob_alarm_time and job_launch_delay their defaults if
 *	     they are still -1.
 *
 *	NOTE(review): every early return below (allocation failure, failure
 *	to record the clear action, vnlp == NULL) skips step 6 - confirm that
 *	the defaults are applied elsewhere in those cases.
 *
 * @return void
 *
 */
void
initialize(void)
{
	unsigned int i;
	int avl;
	AVL_IX_DESC ix;
	char hook_msg[HOOK_MSG_SIZE+1];
	char hook_buf[HOOK_BUF_SIZE+1];
	mom_hook_input_t hook_input;
	mom_hook_output_t hook_output;
	int hook_errcode = 0;
	int hook_rc = 0;
	hook *last_phook = NULL;
	unsigned int hook_fail_action = 0;
	int ret;
	/*
	 * Each node of the AVL tree has a key, the hostname in this
	 * case. The default length of the key is defined in avltree.h,
	 * but then overridden here by the union definition of xxrp. The
	 * rp variable then points to this structure. When memory beyond
	 * the default length of the AVL_IX_REC is accessed, it must be
	 * through xxrp or the compiler will complain about accessing
	 * memory beyond the size of the structure.
	 *
	 */
	union {
		AVL_IX_REC xrp;
		char buf[PBS_MAXHOSTNAME + sizeof(AVL_IX_REC) + 1];
	} xxrp;
	AVL_IX_REC *rp = &xxrp.xrp;
	char none[] = "<unset>";
	enum vnode_sharing hostval;

	/* set limits that can be modified by the Admin */
#ifndef WIN32 /* ---- UNIX ------------------------------------------*/
#ifdef RLIMIT_CORE
	int char_in_cname = 0;

	(void)pbs_loadconf(0);
	if (pbs_conf.pbs_core_limit) {
		char *pc = pbs_conf.pbs_core_limit;

		while (*pc != '\0') {
			/* ctype functions require an unsigned char value */
			if (!isdigit((unsigned char)*pc)) {
				/* there is a character in core limit */
				char_in_cname = 1;
				break;
			}
			pc++;
		}
	}
#if defined(RLIM64_INFINITY)
	if (pbs_conf.pbs_core_limit) {
		struct rlimit64 corelimit;

		corelimit.rlim_max = RLIM64_INFINITY;
		if (strcmp("unlimited", pbs_conf.pbs_core_limit) == 0)
			corelimit.rlim_cur = RLIM64_INFINITY;
		else if (char_in_cname == 1) {
			/* not a number and not "unlimited": warn, use unlimited */
			log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_NODE, LOG_WARNING,
				__func__, msg_corelimit);
			corelimit.rlim_cur = RLIM64_INFINITY;
		} else
			corelimit.rlim_cur =
				(rlim64_t)atol(pbs_conf.pbs_core_limit);
		/* get system core limit */
		(void)getrlimit64(RLIMIT_CORE, &orig_core_limit);
		(void)setrlimit64(RLIMIT_CORE, &corelimit);
	}
#else /* set rlimit 32 bit */
	if (pbs_conf.pbs_core_limit) {
		struct rlimit corelimit;

		corelimit.rlim_max = RLIM_INFINITY;
		if (strcmp("unlimited", pbs_conf.pbs_core_limit) == 0)
			corelimit.rlim_cur = RLIM_INFINITY;
		else if (char_in_cname == 1) {
			/* not a number and not "unlimited": warn, use unlimited */
			log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_NODE, LOG_WARNING,
				__func__, msg_corelimit);
			corelimit.rlim_cur = RLIM_INFINITY;
		} else
#ifdef _SX
			corelimit.rlim_cur =
				atol(pbs_conf.pbs_core_limit);
#else
			corelimit.rlim_cur =
				(rlim_t)atol(pbs_conf.pbs_core_limit);
#endif /* _SX */
		/* get system core limit */
		(void)getrlimit(RLIMIT_CORE, &orig_core_limit);
		(void)setrlimit(RLIMIT_CORE, &corelimit);
	}
#endif /* RLIM64_INFINITY */
#endif /* RLIMIT_CORE */
#endif /* !WIN32 ---------------------------------------------------------- */

	num_pcpus = num_acpus = num_oscpus = 0;
	dep_initialize();
	/* still zero after dep_initialize(): fall back to the pcpus count */
	if (num_oscpus == 0)
		num_oscpus = num_pcpus;
	sprintf(log_buffer, "pcpus=%d, OS reports %d cpu(s)",
		num_pcpus, num_oscpus);
	log_event(PBSEVENT_SYSTEM, 0, LOG_NOTICE, "initialize", log_buffer);

	if (vnlp_from_hook == NULL) {
		if (vnl_alloc(&vnlp_from_hook) == NULL) {
			log_err(PBSE_SYSTEM, __func__, "vnl_alloc failed");
			return;
		}
		vnlp_from_hook->vnl_modtime = time(NULL);
	}

	mom_hook_input_init(&hook_input);
	hook_input.vnl = (vnl_t *)vnlp;
	mom_hook_output_init(&hook_output);
	hook_output.reject_errcode = &hook_errcode;
	hook_output.last_phook = &last_phook;
	hook_output.fail_action = &hook_fail_action;
	hook_output.vnl = (vnl_t *)vnlp_from_hook;

	if (setup_resc(1) != 0) {
		/* log_buffer set in setup_resc */
		log_err(-1, "setup_resc", "warning: failed to setup resourcdef");
	}

	switch ((hook_rc=mom_process_hooks(HOOK_EVENT_EXECHOST_STARTUP,
		PBS_MOM_SERVICE_NAME,
		mom_host, &hook_input, &hook_output, hook_msg,
		sizeof(hook_msg), 0))) {
		case 2: /* no hook script executed - go ahead and accept event */
			break;
		default:
			/* a value of '0' means explicit reject encountered, and '1' means explicit accept. */
			if ((hook_rc != 0) && (hook_rc != 1)) {
				/* we've hit an internal error (malloc error, full disk, etc...), so */
				/* treat this now like a hook error so hook fail_action */
				/* will be consulted. */
				/* Before, behavior of an internal error was to ignore it! */
				hook_errcode = PBSE_HOOKERROR;
			}
			if (hook_errcode == PBSE_HOOKERROR) { /* error */
				if ((last_phook != NULL) &&
					(last_phook->fail_action &
					HOOK_FAIL_ACTION_OFFLINE_VNODES)) {
					snprintf(hook_buf,
						HOOK_BUF_SIZE+1,
						"1,%s", last_phook->hook_name);
					ret = vn_addvnr(vnlp_from_hook,
						mom_short_name,
						VNATTR_HOOK_OFFLINE_VNODES,
						hook_buf, 0, 0, NULL);
					if (ret != 0) {
						snprintf(log_buffer,
							sizeof(log_buffer),
							"Failed to add to "
							"vnlp_from_hook: %s=%s",
							VNATTR_HOOK_OFFLINE_VNODES,
							hook_buf);
						log_event(PBSEVENT_DEBUG2,
							PBS_EVENTCLASS_HOOK, LOG_INFO,
							last_phook->hook_name,
							log_buffer);
					}
					vnlp_from_hook->vnl_modtime = time(NULL);
				}
				break;
			} else if (hook_fail_action & HOOK_FAIL_ACTION_CLEAR_VNODES) {
				/* no hook error */
				vnl_t *vnlp_tmp = NULL;

				/* holds the clear action that must precede the */
				/* contents of vnlp_from_hook */
				if (vnl_alloc(&vnlp_tmp) == NULL) {
					log_err(PBSE_SYSTEM, __func__,
						"vnl_alloc failed");
					return;
				}
				ret = vn_addvnr(vnlp_tmp, mom_short_name,
					VNATTR_HOOK_OFFLINE_VNODES, "0", 0,
					0, NULL);
				if (ret != 0) {
					/*
					 * Report the value that was actually being
					 * added ("0"); hook_buf is never filled in
					 * on this path. last_phook may also be NULL
					 * here, so fall back to the function name.
					 */
					snprintf(log_buffer, sizeof(log_buffer),
						"Failed to add to "
						"vnlp_tmp: %s=%s",
						VNATTR_HOOK_OFFLINE_VNODES,
						"0");
					log_event(PBSEVENT_DEBUG2,
						PBS_EVENTCLASS_HOOK, LOG_INFO,
						(last_phook != NULL) ?
						last_phook->hook_name : __func__,
						log_buffer);
					vnl_free(vnlp_tmp);
					vnlp_tmp = NULL;
					return;
				}
				if (vnlp_from_hook->vnl_used > 0) {
					/* the clear offline_vnodes action ,*/
					/* as stored in 'vnlp_tmp' */
					/* must appear before other vn */
					/* actions (currently in
					 * vnlp_from_hook), since it would be */
					/* clearing the states of all vnodes */
					/* and their comments. vnlp_from_hook */
					/* may contain vnode state and */
					/* comment changes, and we would not */
					/* want to override that. */
					vn_merge(vnlp_tmp, vnlp_from_hook,
						NULL);
					vnl_free(vnlp_from_hook);
					vnlp_from_hook = vnlp_tmp;
				} else {
					vn_merge(vnlp_from_hook, vnlp_tmp,
						NULL);
					vnl_free(vnlp_tmp);
				}
				vnlp_tmp = NULL;
			}
	}

	mom_vnlp_report(vnlp_from_hook, "vnlp_from_hook");
	/* the hooks changed nothing: do not keep an empty list around */
	if (vnlp_from_hook->vnl_used == 0) {
		vnl_free(vnlp_from_hook);
		vnlp_from_hook = NULL;
	}

	if (vnlp == NULL)
		return;

	/*
	 * Check that there are no bad combinations of sharing values
	 * across the vnodes.
	 */
	avl_create_index(&ix, AVL_NO_DUP_KEYS, 0);
	for (i=0; i < vnlp->vnl_used; i++) {
		vnal_t *vnrlp = VNL_NODENUM(vnlp, i);
		char *host = attr_exist(vnrlp, "resources_available.host");
		char *share;
		char *exclhost = none;
		enum vnode_sharing shareval;

		if (host == NULL)
			/* mom_host and mom_short_name are different!! */
			/* use mom short name by default */
			host = mom_short_name;

		share = attr_exist(vnrlp, "sharing");
		shareval = str_to_vnode_sharing(share);
		if (shareval != VNS_UNSET)
			exclhost = vnode_sharing_to_str(shareval);

		/* search for host */
		snprintf(rp->key, PBS_MAXHOSTNAME, "%s", host);
		/* look to see if host has a sharing value saved */
		avl = avl_find_key(rp, &ix);
		if (avl != AVL_IX_OK) {
			/*
			 * Not found so save the one we got.
			 */
			rp->recptr = exclhost;
			if (avl_add_key(rp, &ix) != AVL_IX_OK) {
				log_err(errno, __func__, "avl_add_key");
				die(0);
			}
			continue;
		}

		/* the host exists, check if the saved value is the same */
		if (rp->recptr == (void *)exclhost)
			continue;

		/* they are different, now check if it is a bad combo */
		hostval = str_to_vnode_sharing(rp->recptr);
		if (hostval == VNS_DFLT_EXCLHOST ||
			hostval == VNS_FORCE_EXCLHOST ||
			shareval == VNS_DFLT_EXCLHOST ||
			shareval == VNS_FORCE_EXCLHOST) {
			sprintf(log_buffer,
				"It is erroneous to mix sharing=%s "
				"for vnode %s with sharing=%s which "
				"is set for other vnodes on host %s",
				exclhost, vnrlp->vnal_id,
				(char *)rp->recptr, host);
			log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_NODE,
				LOG_NOTICE, __func__, log_buffer);
			die(0);
		}
	}
	avl_destroy_index(&ix);

	if (joinjob_alarm_time == -1)
		joinjob_alarm_time = DEFAULT_JOINJOB_ALARM;
	if (job_launch_delay == -1)
		job_launch_delay = DEFAULT_JOB_LAUNCH_DELAY;
}
/**
 * @brief
 *	Verify the result of a memory allocation.
 *
 * @param[in]	buf - pointer returned by a preceding allocation call
 *
 * @return	Void
 *
 * @note
 *	A non-NULL buf is accepted silently.  A NULL buf is reported
 *	through log_err() and then die(0) is called.
 */
void
memcheck(char *buf)
{
	if (buf == NULL) {
		log_err(-1, "memcheck", "memory allocation failed");
		die(0);
	}
}
/**
 * @brief
 *	Make sure the global ret_string buffer has room for len more
 *	characters starting at *spot.  When it does not, the buffer is
 *	grown by 2*len bytes with realloc() and *spot is re-pointed at the
 *	same offset inside the new buffer.
 *
 * @param[in,out]	spot - current write position inside ret_string
 * @param[in]		len  - number of additional characters required
 *
 * @return	Void
 *
 * @note
 *	An allocation failure is handled by memcheck(), i.e. it is logged
 *	and die(0) is called.
 */
void
checkret(char **spot, int len)
{
	char	*hold;
	long	 offset;

	/*
	 * Remember the position as an offset *before* reallocating: once
	 * realloc() has moved the block, the old ret_string value may no
	 * longer be used, not even in pointer arithmetic.
	 */
	offset = (long)(*spot - ret_string);
	if (offset < (ret_size - len))
		return;

	ret_size += len*2;		/* new buf size */
	snprintf(log_buffer, sizeof(log_buffer),
		"size increased to %d", ret_size);
	log_event(PBSEVENT_SYSTEM, 0, LOG_DEBUG, __func__, log_buffer);

	hold = realloc(ret_string, ret_size);	/* new buf */
	memcheck(hold);
	*spot = hold + offset;			/* new spot in buf */
	ret_string = hold;
}
/**
 * @brief
 *	skipwhite - skip over leading white space.
 *
 * @param[in]	str - NUL terminated string to be scanned
 *
 * @return	string
 * @retval	pointer to the first character of str that is not white
 *		space; this is the terminating NUL when str holds nothing
 *		but white space.  The string itself is not modified.
 *
 */
char *
skipwhite(char *str)
{
	/* <ctype.h> functions need an unsigned char value, not a plain char */
	while (*str != '\0' && isspace((unsigned char)*str))
		str++;
	return str;
}
/**
 * @brief
 *	Copy the leading token of str into tok.  A token consists of
 *	alphanumeric characters, ':' and '_' (plus '\\' under WIN32).
 *
 * @param[in]	str - string to take the token from
 * @param[out]	tok - destination buffer; always NUL terminated
 * @param[in]	len - maximum number of characters to copy, NOT counting
 *		      the terminating NUL; tok must therefore provide at
 *		      least len + 1 bytes
 *
 * @return	string
 * @retval	pointer into str just past the last character copied, i.e.
 *		at the character that ended the token (or at the rest of
 *		the token when it was cut short at len characters)
 *
 */
char *
tokcpy(char *str, char *tok, size_t len)
{
	size_t	i;

	for (i = 0; *str && (i < len); str++, tok++, i++) {
		/* <ctype.h> functions need an unsigned char value */
		unsigned char	c = (unsigned char)*str;

#ifdef	WIN32
		if (!isalnum(c) && c != ':' && c != '_' && c != '\\')
#else
		if (!isalnum(c) && c != ':' && c != '_')
#endif
			break;
		*tok = *str;
	}
	*tok = '\0';
	return str;
}
/*
 * b must be a real array (not a pointer).  One byte is held back for the
 * terminating NUL that tokcpy() stores after the copied characters.
 */
#define TOKCPY(a, b) tokcpy(a, b, sizeof(b) - 1)
/**
 * @brief
 *	Strip trailing white space (including the new line) from str by
 *	overwriting it with NUL characters.
 *
 * @param[in,out]	str - string to be trimmed in place
 *
 * @return	Void
 *
 * @note
 *	The first character of str is never examined or removed, so a
 *	string made up only of white space keeps its first character.
 *	A NULL or empty str is left untouched.
 */
void
rmnl(char *str)
{
	size_t	i;

	/* an empty string has nothing to trim; never index before str */
	if (str == NULL || *str == '\0')
		return;

	for (i = strlen(str) - 1; i > 0; i--) {
		if (!isspace((unsigned char)str[i]))
			break;
		str[i] = '\0';
	}
}
/**
 * @brief
 *	Variant of tokcpy() in which only white space ends the token.
 *
 * @param[in]	str - string to take the token from
 * @param[out]	tok - destination buffer; always NUL terminated
 * @param[in]	len - maximum number of characters to copy, not counting
 *		      the terminating NUL (tok needs len + 1 bytes)
 *
 * @return	string
 * @retval	pointer into str at the character that stopped the copy
 *
 */
char *
wtokcpy(char *str, char *tok, int len)
{
	int	copied = 0;

	while (*str != '\0' && copied < len && !isspace((int)*str)) {
		*tok++ = *str++;
		copied++;
	}
	*tok = '\0';
	return str;
}
#ifdef WIN32
/**
 * @brief
 *	Variant of wtokcpy() that honours double quotes: white space only
 *	ends the token when it appears outside of a quoted section.  The
 *	quote characters themselves are copied into tok.
 *
 * @param[in]	str - string to take the token from
 * @param[out]	tok - destination buffer; always NUL terminated
 * @param[in]	len - maximum number of characters to copy, not counting
 *		      the terminating NUL (tok needs len + 1 bytes)
 *
 * @return	string
 * @retval	pointer into str at the character that stopped the copy
 *
 */
char *
qwtokcpy(char *str, char *tok, int len)
{
	int	in_quotes = 0;	/* non-zero while inside "..." */
	int	n = 0;

	while (*str != '\0' && n < len) {
		if (isspace((int)*str) && !in_quotes)
			break;
		if (*str == '"')
			in_quotes = !in_quotes;
		*tok++ = *str++;
		n++;
	}
	*tok = '\0';
	return str;
}
#endif /* WIN32 */
/**
 * @brief
 *	Return a newly allocated copy of path with every double quote
 *	character removed.
 *
 * @param[in]	path - input path, may be NULL
 *
 * @return	string
 * @retval	heap copy of path without '"' characters; the caller is
 *		expected to free() it
 * @retval	NULL if path is NULL or the copy could not be allocated
 *
 */
static char *
remove_quotes(char *path)
{
	char		*copy;
	char		*out;
	const char	*in;

	if (path == NULL)
		return NULL;

	/* the result can never be longer than the input */
	copy = strdup(path);
	if (copy == NULL)
		return NULL;

	out = copy;
	for (in = path; *in != '\0'; in++) {
		if (*in != '"')
			*out++ = *in;
	}
	*out = '\0';
	return copy;
}
#ifdef WIN32
/**
 * @brief
 *	Find out whether the PBS_INTERACTIVE service is registered with the
 *	Service Control Manager and record the answer in the global
 *	interactive_svc_avail:
 *	   1 - service exists; an open handle to it is stored in the global
 *	       schPbsInteractive
 *	  -1 - service is not registered (not treated as an error)
 *	   0 - the check itself failed
 *
 * @return	int
 * @retval	0 - On Success (service found, or not registered)
 * @retval	1 - On Error; interactive_svc_avail is reset to 0 so that a
 *		    value left over from an earlier call cannot be trusted
 *
 */
int
check_pbs_interactive()
{
	SC_HANDLE	schPbsInteractiveMngr = INVALID_HANDLE_VALUE;
	int		rc = 0;

	/* Open Service Control Handle for Service Control Manager */
	schPbsInteractiveMngr = OpenSCManager(0, 0, SC_MANAGER_ALL_ACCESS);
	if (schPbsInteractiveMngr == 0) {
		interactive_svc_avail = 0;
		snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not open Service Control Manager");
		log_err(-1, __func__, log_buffer);
		return 1;
	}

	/* Try to open a Service Control Handle for the PBS_INTERACTIVE service */
	schPbsInteractive = OpenService(schPbsInteractiveMngr, g_PbsInteractiveName, SERVICE_ALL_ACCESS);
	if (schPbsInteractive != 0) {
		/* Service exists and its handle was successfully created */
		interactive_svc_avail = 1;
	} else if (GetLastError() == ERROR_SERVICE_DOES_NOT_EXIST) {
		/* Service does not exist */
		interactive_svc_avail = -1;
	} else {
		/* any other reason is a real failure */
		interactive_svc_avail = 0;
		snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not open %s service", g_PbsInteractiveName);
		log_err(-1, __func__, log_buffer);
		rc = 1;
	}

	/* the manager handle is only needed for the lookup above */
	CloseServiceHandle(schPbsInteractiveMngr);
	return rc;
}
/**
 * @brief
 *	Start the PBS_INTERACTIVE service, handing it the full path of the
 *	pbs_idled binary as its single argument.
 *
 * @return	int
 * @retval	0 - service started, or service not registered (in which
 *		    case a warning is logged and nothing is started)
 * @retval	1 - interactive_svc_avail is 0, or StartService() failed
 *
 */
int
start_pbs_interactive()
{
	char	 idled_path[MAX_PATH];
	char	*svc_args[1];

	/* interactive_svc_avail == 0: an error was recorded earlier */
	if (interactive_svc_avail == 0)
		return 1;

	if (interactive_svc_avail == -1) {
		/*
		 * PBS_INTERACTIVE service is not registered; report success
		 * on the assumption that logon/logoff scripts are used instead
		 */
		snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not find %s service, Continuing Cycle Harvesting with Logon/Logoff Script", g_PbsInteractiveName);
		log_event(PBSEVENT_SYSTEM, 0, LOG_WARNING, g_PbsInteractiveName, log_buffer);
		return 0;
	}

	/* the only service argument is the full path of pbs_idled */
	snprintf(idled_path, MAX_PATH, "%s/%s", pbs_conf.pbs_exec_path, "bin/pbs_idled");
	svc_args[0] = idled_path;

	if (StartService(schPbsInteractive, 1, svc_args))
		return 0;

	snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not start %s service", g_PbsInteractiveName);
	log_err(-1, __func__, log_buffer);
	return 1;
}
/**
 * @brief
 *	Stop the PBS_INTERACTIVE service if it is running, remove the
 *	spool/idle_poll_time and spool/idle_touch files under PBS_HOME and
 *	close the global schPbsInteractive handle.
 *
 * @return	void (Nothing)
 *
 * @note
 *	Nothing is done when interactive_svc_avail is -1 or 0.  On any
 *	failure the problem is logged, the handle is closed and the idle
 *	files are left in place.
 */
void
stop_pbs_interactive()
{
	SERVICE_STATUS	status;
	char		spool_file[MAX_PATH];

	/* service not registered (-1) or earlier error (0): nothing to stop */
	if (interactive_svc_avail == -1 || interactive_svc_avail == 0)
		return;

	/* Get information about PBS_INTERACTIVE service */
	if (!QueryServiceStatus(schPbsInteractive, &status)) {
		snprintf(log_buffer, LOG_BUF_SIZE-1, "1st: Can not get information about %s service", g_PbsInteractiveName);
		goto log_failure;
	}

	if (status.dwCurrentState == SERVICE_RUNNING) {
		/* Service is RUNNING, ask it to stop */
		if (!ControlService(schPbsInteractive, SERVICE_CONTROL_STOP, &status)) {
			snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not stop %s service", g_PbsInteractiveName);
			goto log_failure;
		}

		/* wait for the time reported in the status' wait hint */
		Sleep(status.dwWaitHint);

		/* Get information about PBS_INTERACTIVE service again */
		if (!QueryServiceStatus(schPbsInteractive, &status)) {
			snprintf(log_buffer, LOG_BUF_SIZE-1, "2nd: Can not get information about %s service", g_PbsInteractiveName);
			goto log_failure;
		}

		/* still not stopped: report it as a timeout */
		if (status.dwCurrentState != SERVICE_STOPPED) {
			snprintf(log_buffer, LOG_BUF_SIZE-1, "%s service did not respond in timely fashion", g_PbsInteractiveName);
			SetLastError(ERROR_SERVICE_REQUEST_TIMEOUT);
			goto log_failure;
		}
	}

	/* the idle_poll_time file is not needed any more */
	snprintf(spool_file, MAX_PATH, "%s/%s", pbs_conf.pbs_home_path, "spool/idle_poll_time");
	DeleteFile(spool_file);

	/* neither is the idle_touch file */
	snprintf(spool_file, MAX_PATH, "%s/%s", pbs_conf.pbs_home_path, "spool/idle_touch");
	DeleteFile(spool_file);
	goto close_handle;

log_failure:
	log_err(-1, __func__, log_buffer);
close_handle:
	CloseServiceHandle(schPbsInteractive);
	return;
}
/**
* @brief
* Create idle_touch and idle_poll_time file in PBS_HOME/spool directory
* assign read/write permission to everyone to both file
* write idle_poll time to idle_poll_time file
*
* @return int
* @retval 0 - On Success
* @retval 1 - On Error
*
*/
int
create_idle_files()
{
char idle_poll_buf[IDLE_POLL_BUFSIZE];
char WrittenBuf[IDLE_POLL_BUFSIZE];
HANDLE FileHandle = INVALID_HANDLE_VALUE;
char idle_touchFile[MAX_PATH]; /* To store full path of idle_touch file */
char idle_pollFile[MAX_PATH]; /* To store full path of idle_poll_time file */
/* Create full path of idle_poll_time file */
snprintf(idle_pollFile, MAX_PATH, "%s/%s", pbs_conf.pbs_home_path, "spool/idle_poll_time");
/* Create full path of idle_touch file */
snprintf(idle_touchFile, MAX_PATH, "%s/%s", pbs_conf.pbs_home_path, "spool/idle_touch");
/* Convert idle_poll time value into string */
_itoa_s(idle_poll, idle_poll_buf, BUF_SIZE, 10);
/* Create idle_poll_time file in PBS_HOME/spool directory */
FileHandle = CreateFile(idle_pollFile, GENERIC_ALL, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
if (FileHandle == INVALID_HANDLE_VALUE) {
/* idle_poll_time file creation failed */
snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not create file %s", idle_pollFile);
log_err(-1, __func__, log_buffer);
return 1;
} else {
/* idle_poll_time file created */
/* secure idle_poll_time file for everyone with read/write permission */
secure_file(idle_pollFile, "Everyone", READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
/* idle_poll_time file is secured, write idle_poll time into idle_poll_time file */
if (!WriteFile(FileHandle, idle_poll_buf, strlen(idle_poll_buf), (LPDWORD)&WrittenBuf, NULL)) {
/* Writing idle_poll time into idle_poll_time file failed */
snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not write idle_poll time into %s file", idle_pollFile);
log_err(-1, __func__, log_buffer);
CloseHandle(FileHandle);
return 1;
}
}
/* idle_poll_time file created, secured and idle_poll time written sucessfully */
if (FileHandle)
CloseHandle(FileHandle);
/* Create idle_touch file in PBS_HOME/spool directory */
FileHandle = CreateFile(idle_touchFile, GENERIC_ALL, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
if (FileHandle == INVALID_HANDLE_VALUE) {
/* idle_touch file creation failed */
snprintf(log_buffer, LOG_BUF_SIZE-1, "Can not create file %s", idle_touchFile);
log_err(-1, __func__, log_buffer);
return 1;
} else {
/* idle_touch file created */
/* secure idle_touch file for everyone with read/write permission */
secure_file(idle_touchFile, "Everyone", READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
}
/* idle_touch file created and secured sucessfully */
if (FileHandle)
CloseHandle(FileHandle);
return 0;
}
#endif
/**
 * @brief
 *	add_mom_action - Parse a mom action command from the mom config
 *	file and record it in the mom_action array.
 *
 *	The line has the form
 *		EVENT TIMEOUT !SCRIPT [ARG ...]
 *	or
 *		EVENT TIMEOUT requeue
 *	where EVENT must match the ma_name of a mom_action entry and
 *	TIMEOUT must start with a digit.  A SCRIPT that is not an absolute
 *	path is made relative to mom_home.
 *
 * @param[in]	str - line from mom config file which contains the action
 *		      command for mom
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	- on a malformed line, unknown event or verb,
 *				  or memory allocation failure
 * @retval	HANDLER_SUCCESS	- on success
 *
 */
static handler_ret_t
add_mom_action(char *str)
{
	char	  arg[_POSIX_PATH_MAX+1];
	int	  i;
	int	  count;
	int	  is_abs;
	char	 *pc;
	int	  na;
	char	**pargs;
	char	 *scp;
	size_t	  scp_len;
	int	  tout;
	int	  white;

	if (*str == '\0')
		return HANDLER_FAIL;

	/* first token is name of event */
	str = TOKCPY(str, arg);
	str = skipwhite(str);
	if (*str == '\0')
		return HANDLER_FAIL;

	for (na = 0; na < (int)LastAction; na++) {
		if (strcmp(arg, mom_action[na].ma_name) == 0)
			break;		/* have a valid event name */
	}
	if (na >= (int)LastAction)
		return HANDLER_FAIL;

	/* next should come the time out value */
	str = TOKCPY(str, arg);
	str = skipwhite(str);
	if (*str == '\0')
		return HANDLER_FAIL;
	if (!isdigit((unsigned char)*arg))
		return HANDLER_FAIL;
	tout = atoi(arg);

	/* next is the action verb: a script or some keyword */
	if (*str == '!') {
		/* script specified */
		str++;
#ifdef	WIN32
		str = qwtokcpy(str, arg, _POSIX_PATH_MAX);
#else
		str = wtokcpy(str, arg, _POSIX_PATH_MAX);
#endif
		str = skipwhite(str);

#ifdef	WIN32
		is_abs = is_full_path(arg);
#else
		is_abs = (*arg == '/');
#endif
		if (is_abs) {
			scp = strdup(arg);
		} else {
			/* make a relative path absolute to PBS_HOME/mom_priv */
			scp_len = strlen(mom_home) + strlen(arg) + 2;
			scp = malloc(scp_len);
			if (scp != NULL)
				snprintf(scp, scp_len, "%s/%s", mom_home, arg);
		}
		if (scp == NULL)
			return HANDLER_FAIL;

		/* count the white space separated args that follow */
		white = 1;
		count = 0;
		for (pc = str; *pc; pc++) {
			if (isspace((unsigned char)*pc)) {
				white = 1;
			} else if (white) {
				white = 0;
				count++;
			}
		}

		/* one extra slot for the terminating NULL entry */
		pargs = calloc((size_t)count + 1, sizeof(*pargs));
		if (pargs == NULL) {
			free(scp);
			return HANDLER_FAIL;
		}

		/* now we know how many and have space, copy each arg */
		for (i = 0; i < count; i++) {
			str = wtokcpy(str, arg, _POSIX_PATH_MAX);
			str = skipwhite(str);
			if ((pargs[i] = strdup(arg)) == NULL) {
				while (i > 0)
					free(pargs[--i]);
				free(scp);
				free(pargs);
				return HANDLER_FAIL;
			}
		}
		pargs[i] = NULL;

		/* now we can set the action array member */
		mom_action[na].ma_verb    = Script;
		mom_action[na].ma_timeout = tout;
		mom_action[na].ma_script  = scp;
		mom_action[na].ma_args    = pargs;
	} else {
		/*
		 * Not a script, must be a recognized verb.  The verb has to
		 * be tokenized first: arg still holds the time out value.
		 */
		(void)TOKCPY(str, arg);
		if (strcmp(arg, "requeue") != 0)
			return HANDLER_FAIL;	/* error */

		/* Requeue Verb */
		mom_action[na].ma_verb    = Requeue;
		mom_action[na].ma_timeout = tout;
		mom_action[na].ma_script  = NULL;
		mom_action[na].ma_args    = NULL;
	}

	if (mom_action[na].ma_verb == Script)
		snprintf(log_buffer, sizeof(log_buffer), "%s: %s",
			mom_action[na].ma_name, mom_action[na].ma_script);
	else
		snprintf(log_buffer, sizeof(log_buffer), "%s: %s",
			mom_action[na].ma_name, arg);
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
		"action", log_buffer);

	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Look up a host by name and pass each of its addresses, converted
 *	to host byte order, to addrinsert().
 *
 * @param[in]	name - name of host
 *
 * @return	u_long
 * @retval	0			Failure: the lookup failed, the
 *					host has no addresses, or the
 *					addresses are not IPv4 sized
 * @retval	last address inserted	Success (host byte order)
 *
 */
static u_long
addclient_byname(char *name)
{
	struct hostent	*host;
	struct in_addr	 saddr;
	u_long		 ipaddr = 0;
	int		 i;

	if ((host = gethostbyname(name)) == NULL) {
		snprintf(log_buffer, sizeof(log_buffer),
			"host %s not found", name);
		log_err(-1, __func__, log_buffer);
		return 0;
	}

	/* copying h_length bytes must never write past saddr */
	if (host->h_length != (int)sizeof(saddr)) {
		snprintf(log_buffer, sizeof(log_buffer),
			"host %s has an unsupported address length %d",
			name, host->h_length);
		log_err(-1, __func__, log_buffer);
		return 0;
	}

	for (i = 0; host->h_addr_list[i]; i++) {
		memcpy(&saddr, host->h_addr_list[i], sizeof(saddr));
		ipaddr = ntohl(saddr.s_addr);
		addrinsert(ipaddr);
	}
	return ipaddr;
}
/**
* @brief
* wrapper func for addclient_byname.
*
* @param[in] name - name of host
*
* @return handler_ret_t (return value)
* @retval HANDLER_FAIL(0) Failure
* @retval HANDLER_SUCCESS(1) Success
*
*/
static handler_ret_t
addclient(char *name)
{
if (addclient_byname(name) == 0)
return HANDLER_FAIL;
else
return HANDLER_SUCCESS;
}
#if MOM_BGL
/**
 * @brief
 *	Store a copy of the partition list from the config file in the
 *	global reserve_bglpartitions.
 *
 * @param[in]	part_list - partition list
 *
 * @return	handler_ret_t (return value)
 * @retval	HANDLER_FAIL	the copy could not be allocated
 * @retval	HANDLER_SUCCESS	the list was stored and logged
 *
 */
static handler_ret_t
set_bgl_reserve_partitions(char *part_list)
{
	reserve_bglpartitions = strdup(part_list);
	if (reserve_bglpartitions == NULL) {
		log_err(errno, __func__, "strdup failed");
		return HANDLER_FAIL;
	}

	/* bounded: part_list comes straight from the config file */
	snprintf(log_buffer, sizeof(log_buffer),
		"bgl_reserve_partitions %s", part_list);
	log_event(PBSEVENT_SYSTEM, 0, LOG_DEBUG, __func__, log_buffer);
	return HANDLER_SUCCESS;
}
#endif /* MOM_BGL */
/**
 * @brief
 *	Set the log event mask from a config file value.
 *
 *	The value is parsed with strtol() base 0, so decimal, octal (0...)
 *	and hexadecimal (0x...) notations are accepted.  On success the
 *	mask is stored through log_event_mask and also handed to
 *	tpp_set_logmask().
 *
 * @param[in]	value - textual log mask
 *
 * @return	handler_ret_t (return value)
 * @retval	HANDLER_FAIL	the number is followed by something other
 *				than white space; the current mask is left
 *				unchanged
 * @retval	HANDLER_SUCCESS	the mask was applied
 *
 */
static handler_ret_t
setlogevent(char *value)
{
	char	*bad;
	long	 mask;

	mask = strtol(value, &bad, 0);

	/* validate first so that a bad value cannot clobber the mask */
	if ((*bad != '\0') && !isspace((unsigned char)*bad))
		return HANDLER_FAIL;

	*log_event_mask = mask;
	tpp_set_logmask(*log_event_mask);
	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Config handler for the flag that defines whether hook files/scripts
 *	or job scripts to be run under root are rejected by mom.  The value
 *	is parsed by set_boolean() into reject_root_scripts.
 *
 * @param[in]	value - boolean value as given in the config file
 *
 * @return	handler_ret_t - whatever set_boolean() returns
 * @retval	0	failure
 * @retval	1	success
 *
 */
static handler_ret_t
set_reject_root_scripts(char *value)
{
	handler_ret_t	rc;

	rc = set_boolean(__func__, value, &reject_root_scripts);
	return rc;
}
/**
 * @brief
 *	Config handler for the flag that tells the mom to send the
 *	checksums of the hooks it knows about.  The value is parsed by
 *	set_boolean() into report_hook_checksums.
 *
 * @param[in]	value - boolean value as given in the config file
 *
 * @return	handler_ret_t - whatever set_boolean() returns
 * @retval	0	failure
 * @retval	1	success
 *
 */
static handler_ret_t
set_report_hook_checksums(char *value)
{
	handler_ret_t	rc;

	rc = set_boolean(__func__, value, &report_hook_checksums);
	return rc;
}
/**
 * @brief
 *	Append a host name to the global maskclient list (mask_num entries
 *	used out of mask_max allocated).  The list starts with room for 4
 *	entries and is doubled as soon as it becomes full, so there is
 *	always a free slot for the next call.
 *
 * @param[in]	name - name of host
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	memory allocation failed.  If the name could
 *				not be copied, every entry stored so far is
 *				freed and mask_num is reset to 0.
 * @retval	HANDLER_SUCCESS	name was added
 *
 */
static handler_ret_t
restricted(char *name)
{
	char	*entry;
	int	 i;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG, __func__, name);

	if (mask_max == 0) {
		maskclient = (char **)calloc(4, sizeof(char *));
		if (maskclient == NULL)
			return HANDLER_FAIL;	/* error */
		mask_max = 4;
	}

	entry = strdup(name);
	if (entry == NULL) {
		/* drop the whole list; do not leave stale pointers behind */
		for (i = 0; i < mask_num; i++) {
			free(maskclient[i]);
			maskclient[i] = NULL;
		}
		mask_num = 0;
		return HANDLER_FAIL;
	}
	maskclient[mask_num++] = entry;

	if (mask_num == mask_max) {
		char	**tmcl;

		tmcl = (char **)realloc(maskclient,
			2*mask_max*sizeof(char *));
		if (tmcl == NULL)
			return HANDLER_FAIL;	/* error */
		maskclient = tmcl;
		mask_max *= 2;
	}
	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Config handler that sets the global cputfactor from its textual
 *	value.
 *
 * @param[in]	value - value for cputfactor
 *
 * @return	handler_ret_t (return value)
 * @retval	HANDLER_FAIL	atof() produced 0.0 (cputfactor has
 *				nevertheless been assigned that value)
 * @retval	HANDLER_SUCCESS	any other value
 *
 */
static handler_ret_t
cputmult(char *value)
{
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG, __func__, value);

	cputfactor = atof(value);
	if (cputfactor == 0.0)
		return HANDLER_FAIL;	/* error */

	return HANDLER_SUCCESS;
}
#if defined(MOM_CPUSET) && (CPUSET_VERSION >= 4)
/**
 * @brief
 *	set the action to take when encountering
 *	CPU set errors.  value may be one of
 *
 *	"continue"	to log the errors and proceed normally
 *
 *	"offline"	in response to an error, the job's vnodes
 *			on this host will be marked offline;
 *			this is the default action
 *
 * @param[in]	value - action name from the config file
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	value is NULL, empty, too long, not one of
 *				the two keywords, or could not be copied
 * @retval	HANDLER_SUCCESS	cpuset_error_action now points to a heap
 *				copy of the keyword
 *
 */
static handler_ret_t
set_cpuset_error_action(char *value)
{
	char	 tok[80];
	char	*action;

	/* reject NULL before the value is handed to anything else */
	if (value == NULL)
		return HANDLER_FAIL;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG, __func__, value);

	if ((*value == '\0') || (strlen(value) >= sizeof(tok)))
		return HANDLER_FAIL;

	(void) TOKCPY(value, tok);
	if (!strcmp(tok, "continue") || !strcmp(tok, "offline")) {
		action = strdup(tok);
		if (action != NULL) {
			cpuset_error_action = action;
			return HANDLER_SUCCESS;
		}
	}

	return HANDLER_FAIL;
}
#endif /* MOM_CPUSET && CPUSET_VERSION >= 4 */
/**
 * @brief
 *	Config handler that sets the global wallfactor from its textual
 *	value.
 *
 * @param[in]	value - value for wallfactor
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	atof() produced 0.0 (wallfactor has
 *				nevertheless been assigned that value)
 * @retval	HANDLER_SUCCESS	any other value
 *
 */
static handler_ret_t
wallmult(char *value)
{
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG, __func__, value);

	wallfactor = atof(value);
	if (wallfactor == 0.0)
		return HANDLER_FAIL;	/* error */

	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Config handler that adds one entry to the global pcphosts table
 *	(cphosts_num entries in use).  The value has the form
 *		HOST:FROM_PATH TO_PATH
 *	and is split in place: the ':' and the first white space after
 *	FROM_PATH are overwritten with NUL characters.  The table is grown
 *	two entries at a time.
 *
 * @param[in,out]	value - "HOST:FROM_PATH TO_PATH" specification
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	the specification has no ':' or no white
 *				space after FROM_PATH, or memory could not
 *				be allocated; the table keeps its previous
 *				entries
 * @retval	HANDLER_SUCCESS	entry added
 *
 */
static handler_ret_t
usecp(char *value)
{
	char		*pnxt;
	char		*from;
	char		*to;
	static int	 cphosts_max = 0;

	if (cphosts_max == 0) {
		pcphosts = malloc(2 * sizeof(struct cphosts));
		if (pcphosts == NULL)
			return HANDLER_FAIL;
		cphosts_max = 2;
	} else if (cphosts_max == cphosts_num) {
		struct cphosts	*tmppcphosts;

		tmppcphosts = realloc(pcphosts,
			(cphosts_max + 2) * sizeof(struct cphosts));
		if (tmppcphosts == NULL) {
			/*
			 * The old table is still valid and still referenced
			 * through pcphosts/cphosts_num: it must not be freed.
			 */
			return HANDLER_FAIL;
		}
		pcphosts = tmppcphosts;
		cphosts_max += 2;
	}

	pnxt = strchr(value, (int)':');
	if (pnxt == NULL) {
		snprintf(log_buffer, sizeof(log_buffer),
			"invalid host specification: %s", value);
		log_err(-1, __func__, log_buffer);
		return HANDLER_FAIL;
	}
	*pnxt++ = '\0';

#ifdef NAS /* localmod 009 */
	/* support $usecp rules that exclude a pattern, look for hostname
	 * that starts with ! */
	if (value[0] == '!') {
		pcphosts[cphosts_num].cph_exclude = 1;
		value++;
	}
	else {
		pcphosts[cphosts_num].cph_exclude = 0;
	}
#endif /* localmod 009 */

	/* FROM_PATH ends at the first white space; never scan past the NUL */
	from = pnxt;
	while (*pnxt != '\0' && !isspace((unsigned char)*pnxt))
		pnxt++;
	if (*pnxt == '\0') {
		snprintf(log_buffer, sizeof(log_buffer),
			"invalid host specification: no destination path for %s",
			value);
		log_err(-1, __func__, log_buffer);
		return HANDLER_FAIL;
	}
	*pnxt++ = '\0';
	to = skipwhite(pnxt);

	pcphosts[cphosts_num].cph_hosts = strdup(value);
	pcphosts[cphosts_num].cph_from = strdup(from);
	pcphosts[cphosts_num].cph_to = strdup(to);
	if (pcphosts[cphosts_num].cph_hosts == NULL ||
		pcphosts[cphosts_num].cph_from == NULL ||
		pcphosts[cphosts_num].cph_to == NULL) {
		/* the entry is not counted, so release whatever was copied */
		free(pcphosts[cphosts_num].cph_hosts);
		free(pcphosts[cphosts_num].cph_from);
		free(pcphosts[cphosts_num].cph_to);
		pcphosts[cphosts_num].cph_hosts = NULL;
		pcphosts[cphosts_num].cph_from = NULL;
		pcphosts[cphosts_num].cph_to = NULL;
		return HANDLER_FAIL;
	}

#ifdef WIN32
	back2forward_slash(pcphosts[cphosts_num].cph_from);
	back2forward_slash(pcphosts[cphosts_num].cph_to);
#endif

	cphosts_num++;
	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Config handler that sets pe_alarm_time, the prolog alarm value.
 *
 * @param[in]	value - value for prolog alarm
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	value does not convert to a number greater
 *				than zero
 * @retval	HANDLER_SUCCESS	pe_alarm_time was updated
 *
 */
static handler_ret_t
prologalarm(char *value)
{
	extern unsigned int	pe_alarm_time;
	int			seconds;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
		"prolog alarm", value);

	seconds = (unsigned int)atoi(value);
	if (seconds > 0) {
		pe_alarm_time = (unsigned int)seconds;
		return HANDLER_SUCCESS;
	}
	return HANDLER_FAIL;	/* error */
}
/**
 * @brief
 *	Handler function for the $sister_join_job_alarm config option;
 *	stores the value in joinjob_alarm_time.
 *
 * @param[in]	value - the input given in config file.
 *
 * @return	handler_ret_t
 * @retval	HANDLER_SUCCESS	value is a decimal number greater than zero
 *				with nothing following it
 * @retval	HANDLER_FAIL	anything else, including values strtol()
 *				reports as LONG_MIN or LONG_MAX
 */
static handler_ret_t
set_joinjob_alarm(char *value)
{
	char	*endp;
	long	 secs;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
		"sister_join_job_alarm", value);

	secs = strtol(value, &endp, 10);
	if ((*endp == '\0') && (secs > 0) &&
		(secs != LONG_MIN) && (secs != LONG_MAX)) {
		joinjob_alarm_time = secs;
		return HANDLER_SUCCESS;
	}
	return HANDLER_FAIL;	/* error */
}
/**
 * @brief
 *	Handler function for the $job_launch_delay config option; stores
 *	the value in job_launch_delay.
 *
 * @param[in]	value - the input given in config file.
 *
 * @return	handler_ret_t
 * @retval	HANDLER_SUCCESS	value is a decimal number greater than zero
 *				with nothing following it
 * @retval	HANDLER_FAIL	anything else, including values strtol()
 *				reports as LONG_MIN or LONG_MAX
 */
static handler_ret_t
set_job_launch_delay(char *value)
{
	char	*endp;
	long	 secs;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
		"job_launch_delay", value);

	secs = strtol(value, &endp, 10);
	if ((*endp == '\0') && (secs > 0) &&
		(secs != LONG_MIN) && (secs != LONG_MAX)) {
		job_launch_delay = secs;
		return HANDLER_SUCCESS;
	}
	return HANDLER_FAIL;	/* error */
}
#ifdef WIN32
/**
 * @brief
 *	Config handler that sets the global nrun_factor.
 *
 * @param[in]	value - value for nrun_factor
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	atoi() produced 0 (nrun_factor has
 *				nevertheless been assigned that value)
 * @retval	HANDLER_SUCCESS	any other value
 *
 */
static handler_ret_t
set_nrun_factor(char *value)
{
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG, __func__, value);

	nrun_factor = atoi(value);
	return (nrun_factor != 0) ? HANDLER_SUCCESS : HANDLER_FAIL;
}
/**
 * @brief
 *	Terminate the process tree rooted at the process referred to by
 *	shell_escape_handle.  The processes are terminated with exit code
 *	13.  Nothing is done while shell_escape_handle is still
 *	INVALID_HANDLE_VALUE.
 *
 * @return	Void
 *
 */
static HANDLE shell_escape_handle = INVALID_HANDLE_VALUE;
static void
shell_escape_timeout(void)
{
	char	msg[LOG_BUF_SIZE] = "";
	int	last_err = 0;
	int	rc = 0;

	if (shell_escape_handle == INVALID_HANDLE_VALUE)
		return;

	/* Terminated process would have exit code 13 */
	rc = processtree_op_by_handle(shell_escape_handle, TERMINATE, 13);
	if (rc != -1) {
		SetLastError(0);
	} else {
		last_err = GetLastError();
		sprintf(msg, "could not terminate shell escape process tree, pid=%d", GetProcessId(shell_escape_handle));
		log_err(last_err, "shell_escape_timeout", msg);
	}

	/* logged whether or not the termination worked */
	log_err(-1, "shell_escape_timeout", "terminate shell escape");
}
#endif /* WIN32 */
/**
 * @brief
 *	Values used in enforcement of the cpupercent calculation and
 *	other limit enforcement.  They are read and set by
 *	set_enforcement() below.
 *
 *	Numeric values are given in the form of:
 *		$enforce NAME VALUE
 *
 *	where NAME is one of the following variables.  Each definition
 *	shows the default value and, in the trailing comment, the intended
 *	range of values (I = integer, D = double).  set_enforcement()
 *	converts VALUE with atoi()/atof() and does not check the range.
 */
int delta_percent_over = 50; /* 0 <= I <= 100 */
double delta_cpufactor = 1.05; /* 1.0 <= D */
double delta_weightup = 0.4; /* 0.0 <= D <= 1.0 */
double delta_weightdown = 0.1; /* 0.0 <= D <= 1.0 */
int average_percent_over = 50; /* 0 <= I <= 100 */
double average_cpufactor = 1.025; /* 1.0 <= D */
int average_trialperiod = 120; /* 0 <= I */
/*
 * On/off switches are given in the form of: $enforce [!]NAME
 * A leading '!' turns the switch off.  NAME is the keyword shown in the
 * comment above each of the following variables:
 */
/* cpuburst */
int cpuburst = 0; /* 1 or 0 */
/* cpuaverage */
int cpuaverage = 0; /* 1 or 0 */
/* mem */
int enforce_mem = 0; /* on, value ignored */
/* complexmem */
int complex_mem_calc = 0; /* 1 or 0 */
/**
 * @brief
 *	Config handler for "$enforce [!]NAME [VALUE]".
 *
 *	NAME selects either a numeric value, which requires VALUE and is
 *	converted with atoi() or atof(), or an on/off switch, which is set
 *	to 1 unless NAME is preceded by '!'.  A leading '!' has no effect
 *	on the numeric values.
 *
 * @param[in]	str - text following the $enforce keyword
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	str is NULL, NAME is unknown, or VALUE is
 *				missing for a numeric value
 * @retval	HANDLER_SUCCESS	the variable was set
 *
 */
static handler_ret_t
set_enforcement(char *str)
{
	/* numeric values converted with atoi() */
	static const struct {
		const char	*name;
		int		*var;
	} int_values[] = {
		{ "delta_percent_over",		&delta_percent_over },
		{ "average_percent_over",	&average_percent_over },
		{ "average_trialperiod",	&average_trialperiod },
	};
	/* numeric values converted with atof() */
	static const struct {
		const char	*name;
		double		*var;
	} dbl_values[] = {
		{ "delta_cpufactor",	&delta_cpufactor },
		{ "delta_weightup",	&delta_weightup },
		{ "delta_weightdown",	&delta_weightdown },
		{ "average_cpufactor",	&average_cpufactor },
	};
	/* on/off switches, no value expected */
	static const struct {
		const char	*name;
		int		*var;
	} switches[] = {
		{ "cpuburst",	&cpuburst },
		{ "cpuaverage",	&cpuaverage },
		{ "mem",	&enforce_mem },
		{ "complexmem",	&complex_mem_calc },
	};
	char	arg[80];
	int	on = 1;
	size_t	k;

	if (str == NULL)
		return HANDLER_FAIL;

	/* if current token starts with !, then set value off and skip ! */
	if (*str == '!') {
		on = 0;
		str++;
	}

	str = TOKCPY(str, arg);
	str = skipwhite(str);

	for (k = 0; k < sizeof(int_values) / sizeof(int_values[0]); k++) {
		if (strcmp(arg, int_values[k].name) != 0)
			continue;
		if (*str == '\0')
			return HANDLER_FAIL;
		*int_values[k].var = atoi(str);
		return HANDLER_SUCCESS;
	}

	for (k = 0; k < sizeof(dbl_values) / sizeof(dbl_values[0]); k++) {
		if (strcmp(arg, dbl_values[k].name) != 0)
			continue;
		if (*str == '\0')
			return HANDLER_FAIL;
		*dbl_values[k].var = atof(str);
		return HANDLER_SUCCESS;
	}

	for (k = 0; k < sizeof(switches) / sizeof(switches[0]); k++) {
		if (strcmp(arg, switches[k].name) == 0) {
			*switches[k].var = on;	/* may be off */
			return HANDLER_SUCCESS;
		}
	}

	return HANDLER_FAIL;
}
/**
* @brief
* check for the type of action to be done on a certain event
*
* @param[in] ae - enum val for action_event
*
* @return the Action_Verb enum value, see mom_func.h:
* @retval Default no directive to change the action for the event
* @retval Script defined in see mom_func.h
* @retval Requeue defined in see mom_func.h
*
*/
enum Action_Verb
chk_mom_action(enum Action_Event ae)
{
assert((0 <= ae) && (ae < (int)LastAction));
return mom_action[ae].ma_verb;
}
#define PBS_MAX_ACTION_ARGS 2000
/**
* @brief
* if there is an external script defined for this
* action, do it and return values:
*
* @retval 1 script running in child process
* @retval 0 script ran with no error
* @retval -1 error, script did not run correctly
* @retval -2 error, no script - do normal default action
*
* The "post" function is called out of scan_for_terminated() when the
* child process (script) exits. It is called with the pointer to the job
* and the script exit value. If the script does not complete in the
* specified timeout value, the "post" function will be called with the
* error value of -1.
*
* The action taken by the "post" function on a error depends on the
* function itself. Usually it should preform the "default" action
* for that action.
*
*/
int
do_mom_action_script(int ae, /* index into action table */
	job *pjob, /* ptr to job */
	pbs_task *ptask, /* ptr to task; if NULL the job's first task is used */
	char *path, /* value substituted for a "%path" argument, may be NULL */
	void (*post)(job *p, int e)) /* post action func */
{
	/*
	 * Return values produced below:
	 *   -2  no script is configured for action "ae"
	 *   -1  failure (script missing or insecure, job has no task, a
	 *       subtask is already active, check_pwd() failed, unknown
	 *       "%" argument, out of memory, spawn failure, wait failure,
	 *       or - on Unix - the script exited non-zero or on a signal)
	 *    0  the script was waited for and succeeded, or the task was
	 *       transmogrified
	 *    1  "post" was supplied and the script was left running
	 * On Windows, when the script is waited for and the wait succeeds,
	 * rc is the raw exit code returned by GetExitCodeProcess().
	 */
	char **args = NULL;
#ifdef WIN32
	char buf[MAX_PATH + 1];
#else
	char buf[MAXPATHLEN + 1];
#endif
	int i;
	int nargs;
	char **pargs;
	struct stat sb;
	struct passwd *pwdp;
	int rc = -1;
	struct mom_action *ma;
	int transmog = 0;
#ifdef WIN32
	char *pnoq = 0;
	char cmd_line[4096];
	int flags = CREATE_DEFAULT_ERROR_MODE|CREATE_NEW_CONSOLE|
		CREATE_NEW_PROCESS_GROUP;
	STARTUPINFO si = { 0 };
	PROCESS_INFORMATION pi = { 0 };
	char *env_block;
	char *shell;
	extern char *variables_else[];
	DWORD ret;
	HANDLE hjob;
#else
	int j;
	int pipes[2], kid_read = -1, kid_write = -1;
	int parent_read = -1, parent_write = -1;
	struct startjob_rtn sjr;
	pid_t child;

	memset(&sjr, 0, sizeof(sjr));
#endif /* WIN32 */

	assert((0 <= ae) && (ae < (int)LastAction));
	ma = &mom_action[ae];
	if ((ma == NULL) || (ma->ma_script == NULL))
		return -2;

	/* does script really exist? */
#ifdef WIN32
	pnoq = remove_quotes(ma->ma_script);
	if (stat(pnoq, &sb) == -1) {
		sprintf(log_buffer, "action %s script %s does not exist",
			ma->ma_name, ma->ma_script);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (pnoq)
			free(pnoq);
		return -1;
	} else if (pnoq && chk_file_sec(pnoq, 0, 0, WRITES_MASK^FILE_WRITE_EA, 0)) {
		sprintf(log_buffer, "action %s script %s cannot be executed "
			"due to permissions", ma->ma_name, ma->ma_script);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (pnoq)
			free(pnoq);
		return -1;
	}
	if (pnoq)
		free(pnoq);
#else
	if (stat(ma->ma_script, &sb) == -1) {
		sprintf(log_buffer, "action %s script %s does not exist",
			ma->ma_name, ma->ma_script);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		return -1;
	} else if ((sb.st_uid != 0) || (sb.st_gid > 10) ||
		((sb.st_mode & S_IXUSR) != S_IXUSR) ||
		((sb.st_mode & S_IWOTH) != 0)) {
		/*
		 * The script must be owned by uid 0, have a gid of at most 10,
		 * be executable by its owner and must not be world writable.
		 */
		sprintf(log_buffer, "action %s script %s cannot be executed "
			"due to permissions", ma->ma_name, ma->ma_script);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		return -1;
	}
#endif /* WIN32 */

	if (ptask == NULL)
		ptask = (pbs_task *)GET_NEXT(pjob->ji_tasks);
	if (ptask == NULL) {
		sprintf(log_buffer, "action %s script %s cannot run because"
			" job has no tasks", ma->ma_name, ma->ma_script);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		return -1;
	}

	/*
	 ** If we are going to leave the script running in the background,
	 ** the ji_momsubt field has to be free to track the pid.
	 */
	if (post != NULL && pjob->ji_momsubt) {
		sprintf(log_buffer, "action %s script %s cannot be run"
			" due to existing subtask", ma->ma_name,
			ma->ma_script);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		return -1;
	}

	/* NOTE(review): log_buffer is presumably filled in by check_pwd() on failure - confirm */
	if ((pwdp = check_pwd(pjob)) == NULL) {
		log_event(PBSEVENT_JOB | PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB,
			LOG_ERR, pjob->ji_qs.ji_jobid, log_buffer);
		return -1;
	}

	/* build up args to script */
	for (nargs = 0, pargs = ma->ma_args; pargs && *pargs; pargs++)
		nargs++;
	/* Add one for the command itself */
	nargs++;

	/* one extra zeroed slot keeps the vector NULL terminated */
	args = calloc((nargs + 1), sizeof(char *));
	if (args == NULL)
		return -1;

#ifdef WIN32
	strcpy(cmd_line, "cmd /q /c ");
	strcat(cmd_line, ma->ma_script);
#endif

	/* set args[0] to script */
	args[0] = strdup(ma->ma_script);
	if (args[0] == NULL) {
		free(args);
		return -1;
	}

	/* expand each configured argument; "%name" arguments are substituted */
	pargs = ma->ma_args;
	for (i = 1; i < nargs; i++, pargs++) {
		if (**pargs == '%') {
			if (strcmp(*pargs + 1, "jobid") == 0) {
				(void)strcpy(buf, pjob->ji_qs.ji_jobid);
			} else if (strcmp(*pargs + 1, "sid") == 0) {
				sprintf(buf, "%d", ptask->ti_qs.ti_sid);
			} else if (strcmp(*pargs + 1, "taskid") == 0) {
				sprintf(buf, "%d", ptask->ti_qs.ti_task);
			} else if (strcmp(*pargs + 1, "uid") == 0) {
#ifdef WIN32
				sprintf(buf, "%ld", pjob->ji_qs.ji_un.ji_momt.ji_exuid);
#else
				sprintf(buf, "%d", pjob->ji_qs.ji_un.ji_momt.ji_exuid);
#endif
			} else if (strcmp(*pargs + 1, "gid") == 0) {
#ifdef WIN32
				sprintf(buf, "%ld", pjob->ji_qs.ji_un.ji_momt.ji_exgid);
#else
				sprintf(buf, "%d", pjob->ji_qs.ji_un.ji_momt.ji_exgid);
#endif
			} else if (strcmp(*pargs + 1, "login") == 0) {
				strcpy(buf,
					pjob->ji_wattr[(int)JOB_ATR_euser].at_val.at_str);
			} else if (strcmp(*pargs + 1, "owner") == 0) {
				strcpy(buf,
					pjob->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str);
			} else if (strcmp(*pargs + 1, "globid") == 0) {
				strcpy(buf, "NULL");
			} else if (strcmp(*pargs + 1, "auxid") == 0) {
				if (pjob->ji_wattr[(int)JOB_ATR_altid].at_val.at_str) {
					strncpy(buf, pjob->ji_wattr[(int)JOB_ATR_altid].
						at_val.at_str, sizeof(buf)-1);
					buf[sizeof(buf)-1] = '\0';
				} else
					strcpy(buf, "NULL");
			} else if (strcmp(*pargs + 1, "path") == 0) {
				if (path != NULL) {
					strncpy(buf, path, sizeof(buf)-1);
					buf[sizeof(buf)-1] = '\0';
				} else
					strcpy(buf, "NULL");
			} else {
				sprintf(log_buffer, "action %s script %s cannot be run"
					" due to unknown parameter %s",
					ma->ma_name, ma->ma_script,
					*pargs);
				log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
					pjob->ji_qs.ji_jobid, log_buffer);
				goto done;
			}
		} else {
			(void)strcpy(buf, *pargs);
		}
		*(args + i) = strdup(buf);
		if (*(args + i) == NULL) {
			/*
			 * rc is still -1 here; leave through "done" so the
			 * vector and the strings already duplicated are freed.
			 */
			goto done;
		}
#ifdef WIN32
		strcat(cmd_line, " ");
		strcat(cmd_line, buf);
#endif
	}

#ifdef WIN32
	shell = set_shell(pjob, pwdp); /* machine dependent */
	init_envp();

	/*
	 ** Setup environment
	 */
	/* UID */
	sprintf(buf, "%d", pjob->ji_qs.ji_un.ji_momt.ji_exuid);
	bld_wenv_variables("UID", buf);
	/* GID */
	sprintf(buf, "%d", pjob->ji_qs.ji_un.ji_momt.ji_exgid);
	bld_wenv_variables("GID", buf);
	/* HOME */
	bld_wenv_variables(variables_else[0], pwdp->pw_dir);
	/* LOGNAME */
	bld_wenv_variables(variables_else[1], pwdp->pw_name);
	/* PBS_JOBNAME */
	bld_wenv_variables(variables_else[2],
		pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str);
	/* PBS_JOBID */
	bld_wenv_variables(variables_else[3], pjob->ji_qs.ji_jobid);
	/* PBS_QUEUE */
	bld_wenv_variables(variables_else[4],
		pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str);
	/* SHELL */
	bld_wenv_variables(variables_else[5], shell);
	/* USER */
	bld_wenv_variables(variables_else[6], pwdp->pw_name);
	/* PBS_JOBCOOKIE */
	bld_wenv_variables(variables_else[7],
		pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str);
	/* PBS_NODENUM */
	sprintf(buf, "%d", pjob->ji_nodeid);
	bld_wenv_variables(variables_else[8], buf);
	/* PBS_TASKNUM */
	sprintf(buf, "%ld", (long)ptask->ti_qs.ti_task);
	bld_wenv_variables(variables_else[9], buf);
	/* PBS_MOMPORT */
	sprintf(buf, "%d", pbs_rm_port);
	bld_wenv_variables(variables_else[10], buf);
	/* PBS_NODEFILE */
	sprintf(buf, "%s/aux/%s", pbs_conf.pbs_home_path,
		pjob->ji_qs.ji_jobid);
	bld_wenv_variables(variables_else[11], buf);
	/* PBS_SID */
	sprintf(buf, "%ld", ptask->ti_qs.ti_sid);
	bld_wenv_variables("PBS_SID", buf);
	/* PBS_JOBDIR */
	if ((pjob->ji_wattr[(int)JOB_ATR_sandbox].at_flags & ATR_VFLAG_SET) &&
		(strcasecmp(pjob->ji_wattr[JOB_ATR_sandbox].at_val.at_str, "PRIVATE") == 0)) {
		bld_wenv_variables("PBS_JOBDIR", jobdirname(pjob->ji_qs.ji_jobid, pjob->ji_grpcache->gc_homedir));
	} else {
		bld_wenv_variables("PBS_JOBDIR", pjob->ji_grpcache->gc_homedir);
	}
	/* USERPROFILE */
	bld_wenv_variables(variables_else[16],
		default_local_homedir(pwdp->pw_name,
		pwdp->pw_userlogin, 1));
	/* USERNAME */
	bld_wenv_variables(variables_else[17], pwdp->pw_name);

	/*
	 ** Special case for restart_transmogrify.
	 */
	if ((transmog = (ae == RestartAction) && restart_transmogrify)) {
		if (pjob->ji_hJob == INVALID_HANDLE_VALUE) {
			hjob = CreateJobObject(NULL, pjob->ji_qs.ji_jobid);
			if (hjob == NULL) {
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
					LOG_WARNING, pjob->ji_qs.ji_jobid,
					"Unable to create job");
				goto done;
			}
			pjob->ji_hJob = hjob;
		}
		if ((i = mom_set_limits(pjob, SET_LIMIT_SET)) != PBSE_NONE) {
			sprintf(log_buffer, "Unable to set limits, err=%d", i);
			log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				LOG_WARNING, pjob->ji_qs.ji_jobid, log_buffer);
			goto done;
		}
		/* start suspended so the process can join the job object first */
		flags |= CREATE_SUSPENDED;
	}

	env_block = make_envp();
	si.cb = sizeof(si);
	si.lpDesktop = "";
	ret = CreateProcess(NULL, cmd_line,
		NULL, NULL, TRUE, flags,
		env_block, NULL, &si, &pi);
	/* could be sitting on a user's network directory (checkpoint action) */
	if ((ret == 0) && (GetLastError() == ERROR_ACCESS_DENIED)) {
		char current_dir[MAX_PATH+1];
		char *temp_dir = NULL;

		current_dir[0] = '\0';
		_getcwd(current_dir, MAX_PATH+1);
		temp_dir = get_saved_env("SYSTEMROOT");
		chdir(temp_dir?temp_dir:"C:\\");
		ret = CreateProcess(NULL, cmd_line,
			NULL, NULL, TRUE, flags,
			env_block, NULL, &si, &pi);
		/* restore current working directory */
		chdir(current_dir);
	}
	if (env_block)
		free(env_block);
	if (ret == 0) {
		sprintf(log_buffer, "action script %s cannot be run due to"
			" CreateProcess failure %d", ma->ma_script,
			WSAGetLastError());
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		goto done;
	}

	if (post != NULL) { /* post func means we do not wait */
		/* hook action process to pjob so that when pjob gets killed */
		/* all action processes will get cleared */
		if (pjob->ji_hJob != INVALID_HANDLE_VALUE)
			AssignProcessToJobObject(pjob->ji_hJob, pi.hProcess);
		rc = 1;
		pjob->ji_momsubt = pi.hProcess;
		pjob->ji_mompost = post;
		if (ma->ma_timeout)
			pjob->ji_actalarm = time_now + ma->ma_timeout;
		else
			pjob->ji_actalarm = 0;
		goto done;
	}

	if (transmog) { /* setup new task */
		/*
		 * call to daemon_protect() to unprotect not needed
		 * for Windows as the created process does not inherit
		 * the protection value
		 */
		ret = AssignProcessToJobObject(pjob->ji_hJob, pi.hProcess);
		if (!ret) {
			log_err(-1, __func__, "AssignProcessToJobObject");
			CloseHandle(pi.hThread);
			if (processtree_op_by_handle(pi.hProcess, TERMINATE, 13) == -1)
				log_err(-1, __func__, "processtree_op_by_handle failed with TERMINATE");
			CloseHandle(pi.hProcess);
			goto done;
		}
		/*
		 ** After adding process to job
		 */
		ptask->ti_hProc = pi.hProcess;
		ptask->ti_qs.ti_sid = pi.dwProcessId;
		ptask->ti_qs.ti_status = TI_STATE_RUNNING;
		(void)task_save(ptask);
		/* update the job with the new session id */
		pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long
			= ptask->ti_qs.ti_sid;
		pjob->ji_wattr[(int)JOB_ATR_session_id].at_flags =
			ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			pjob->ji_qs.ji_state = JOB_STATE_RUNNING;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			job_save(pjob, SAVEJOB_QUICK);
		}
		(void)sprintf(log_buffer, "task transmogrified, %s", cmd_line);
		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		ret = ResumeThread(pi.hThread);
		if (ret == -1) {
			log_err(-1, __func__, "ResumeThread");
			CloseHandle(pi.hThread);
			if (processtree_op_by_handle(pi.hProcess, TERMINATE, 13) == -1)
				log_err(-1, __func__, "processtree_op_by_handle failed with TERMINATE");
			goto done;
		}
		CloseHandle(pi.hThread);
		/* send updated session id to server */
		update_ajob_status(pjob);
		rc = 0;
	}
	else { /* wait for script */
		ret = WaitForSingleObject(pi.hProcess, ma->ma_timeout*1000);
		if (ret == WAIT_TIMEOUT) {
			sprintf(log_buffer, "%s script %s: timed out",
				ma->ma_name, ma->ma_script);
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid,
				log_buffer);
			rc = -1;
		} else if (ret == WAIT_FAILED) { /* abnormal exit */
			sprintf(log_buffer, "%s script %s: wait failed %d",
				ma->ma_name, ma->ma_script, GetLastError());
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer);
			rc = -1;
		} else {
			GetExitCodeProcess(pi.hProcess, (DWORD *)&rc);
			sprintf(log_buffer, "%s script %s: exit code %d",
				ma->ma_name, ma->ma_script, rc);
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid, log_buffer);
		}
		/* The following is iffy --
		 ** should we close the handle or do it outside?
		 */
		if (ret != WAIT_TIMEOUT) {
			CloseHandle(pi.hProcess);
		}
	}
	init_envp();
#else
	/*
	 ** Special case for restart_transmogrify.
	 ** The script is going to morph into the task so we have to
	 ** setup pipes just like in start_process()
	 */
	if ((transmog = (ae == RestartAction) && restart_transmogrify)) {
		sprintf(log_buffer, "action %s script %s preparing "
			"to transmogrify task %8.8X",
			ma->ma_name, ma->ma_script,
			ptask->ti_qs.ti_task);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);

		/* first pipe: child writes, parent reads */
		if (pipe(pipes) == -1)
			goto done;
		if (pipes[1] < 3) {
			/* move the child's end above the stdio descriptors */
			kid_write = fcntl(pipes[1], F_DUPFD, 3);
			close(pipes[1]);
		}
		else
			kid_write = pipes[1];
		parent_read = pipes[0];

		/* second pipe: parent writes, child reads */
		if (pipe(pipes) == -1) {
			close(kid_write);
			close(parent_read);
			goto done;
		}
		if (pipes[0] < 3) {
			kid_read = fcntl(pipes[0], F_DUPFD, 3);
			close(pipes[0]);
		}
		else
			kid_read = pipes[0];
		parent_write = pipes[1];
	}
	else if (ae == RestartAction) {
		sprintf(log_buffer, "action %s script %s preparing "
			"to restart task %8.8X",
			ma->ma_name, ma->ma_script,
			ptask->ti_qs.ti_task);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
	}

	if ((child = fork_me(-1)) == 0) { /* child */
		extern char *variables_else[];
		char *shell;

		/* unprotect the child process which becomes the job */
		daemon_protect(0, PBS_DAEMON_PROTECT_OFF);
		shell = set_shell(pjob, pwdp); /* machine dependent */
		vtable.v_ensize = 30;
		vtable.v_used = 0;
		vtable.v_envp = (char **)calloc(vtable.v_ensize,
			sizeof(char *));
		if (vtable.v_envp == NULL) {
			/*
			 * NOTE(review): this returns to the caller from inside
			 * the forked child rather than exiting - confirm that
			 * is intended.
			 */
			free(args);
			log_err(errno, "setup environment", "out of memory");
			return -1;
		}

		/*
		 ** Setup environment
		 */
		/* UID */
		sprintf(buf, "%d", pjob->ji_qs.ji_un.ji_momt.ji_exuid);
		bld_env_variables(&vtable, "UID", buf);
		/* GID */
		sprintf(buf, "%d", pjob->ji_qs.ji_un.ji_momt.ji_exgid);
		bld_env_variables(&vtable, "GID", buf);
		/* HOME */
		bld_env_variables(&vtable, variables_else[0], pwdp->pw_dir);
		/* LOGNAME */
		bld_env_variables(&vtable, variables_else[1], pwdp->pw_name);
		/* PBS_JOBNAME */
		bld_env_variables(&vtable, variables_else[2],
			pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str);
		/* PBS_JOBID */
		bld_env_variables(&vtable, variables_else[3],
			pjob->ji_qs.ji_jobid);
		/* PBS_QUEUE */
		bld_env_variables(&vtable, variables_else[4],
			pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str);
		/* SHELL */
		bld_env_variables(&vtable, variables_else[5], shell);
		/* USER */
		bld_env_variables(&vtable, variables_else[6], pwdp->pw_name);
		/* PBS_JOBCOOKIE */
		bld_env_variables(&vtable, variables_else[7],
			pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str);
		/* PBS_NODENUM */
		sprintf(buf, "%d", pjob->ji_nodeid);
		bld_env_variables(&vtable, variables_else[8], buf);
		/* PBS_TASKNUM */
		sprintf(buf, "%ld", (long)ptask->ti_qs.ti_task);
		bld_env_variables(&vtable, variables_else[9], buf);
		/* PBS_MOMPORT */
		sprintf(buf, "%d", pbs_rm_port);
		bld_env_variables(&vtable, variables_else[10], buf);
		/* PBS_NODEFILE */
		sprintf(buf, "%s/aux/%s", pbs_conf.pbs_home_path,
			pjob->ji_qs.ji_jobid);
		bld_env_variables(&vtable, variables_else[11], buf);
		/* PBS_SID */
		sprintf(buf, "%d", ptask->ti_qs.ti_sid);
		bld_env_variables(&vtable, "PBS_SID", buf);
		/* PBS_JOBDIR */
		if ((pjob->ji_wattr[(int)JOB_ATR_sandbox].at_flags &
			ATR_VFLAG_SET) &&
			(strcasecmp(pjob->ji_wattr[JOB_ATR_sandbox].
			at_val.at_str, "PRIVATE") == 0)) {
			bld_env_variables(&vtable, "PBS_JOBDIR",
				jobdirname(pjob->ji_qs.ji_jobid,
				pjob->ji_grpcache->gc_homedir));
		} else {
			bld_env_variables(&vtable, "PBS_JOBDIR",
				pjob->ji_grpcache->gc_homedir);
		}

		mom_unnice();

		if (set_mach_vars(pjob, &vtable) != 0) {
			log_event(PBSEVENT_JOB | PBSEVENT_SECURITY,
				PBS_EVENTCLASS_JOB,
				LOG_ERR, pjob->ji_qs.ji_jobid,
				"failed to setup dependent environment!");
			/* NOTE(review): also returns from inside the child - confirm */
			free(args);
			return -1;
		}

		/*
		 ** Do the same operations as start_process() but we don't
		 ** need to reset the global ID.
		 */
		if (transmog) {
			close(parent_read);
			close(parent_write);
#if MOM_ALPS
			/*
			 * ALPS jobs need a new PAGG when
			 * being restarted.
			 */
			memset(pjob->ji_extended.ji_ext.ji_4jid, 0,
				sizeof(pjob->ji_extended.ji_ext.ji_4jid));
#endif
			j = set_job(pjob, &sjr);
			if (j < 0) {
				if (j == -1) {
					strcpy(log_buffer,
						"Unable to set task session");
				}
				DBPRT(("%s: %s\n", __func__, log_buffer))
				log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
					LOG_NOTICE, pjob->ji_qs.ji_jobid,
					log_buffer);
				if (j == -3)
					j = JOB_EXEC_FAIL2;
				else
					j = JOB_EXEC_RETRY;
				/* NOTE(review): starter_return() presumably does not return for error codes - confirm */
				starter_return(kid_write, kid_read, j, &sjr);
			}
			ptask->ti_qs.ti_sid = sjr.sj_session;
			i = mom_set_limits(pjob, SET_LIMIT_SET);
			if (i != PBSE_NONE) {
				sprintf(log_buffer,
					"Unable to set limits, err=%d", i);
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
					LOG_WARNING, pjob->ji_qs.ji_jobid,
					log_buffer);
				if (i == PBSE_RESCUNAV)
					j = JOB_EXEC_RETRY;
				else
					j = JOB_EXEC_FAIL2;
				starter_return(kid_write, kid_read, j, &sjr);
			}
			log_close(0);
			starter_return(kid_write, kid_read, JOB_EXEC_OK, &sjr);
		}
		else { /* just close down anything hanging */
			close(0);
			close(1);
			close(2);
		}

		execve(ma->ma_script, args, vtable.v_envp);
		/* only reached when execve() fails */
		exit(254);
	}

	if (child == -1) { /* error */
		sprintf(log_buffer, "action script %s cannot be run due to"
			" fork failure %d", ma->ma_script, errno);
		log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (transmog) {
			/*
			 * No child exists to use the pipes, so release all
			 * four ends here instead of leaking them.
			 */
			close(kid_read);
			close(kid_write);
			close(parent_read);
			close(parent_write);
		}
		goto done;
	}

	if (post != NULL) { /* post func means we do not wait */
		rc = 1;
		pjob->ji_momsubt = child;
		pjob->ji_mompost = post;
		if (ma->ma_timeout)
			pjob->ji_actalarm = time_now + ma->ma_timeout;
		else
			pjob->ji_actalarm = 0;
		goto done;
	}

	if (transmog) { /* setup new task */
		close(kid_read);
		close(kid_write);

		/* read sid */
		i = readpipe(parent_read, &sjr, sizeof(sjr));
		j = errno; /* save before close() can overwrite it */
		close(parent_read);
		if (i != sizeof(sjr)) {
			sprintf(log_buffer,
				"read of pipe for pid job %s got %d not %d",
				pjob->ji_qs.ji_jobid, i, (int)sizeof(sjr));
			log_err(j, __func__, log_buffer);
			close(parent_write);
			goto done;
		}

		/* send info back as an acknowledgment */
		writepipe(parent_write, &sjr, sizeof(sjr));
		close(parent_write);
		DBPRT(("%s: read start return %d %d\n", __func__,
			sjr.sj_code, sjr.sj_session))

		/* update system specific ids and information from set_job() */
		set_globid(pjob, &sjr);
		if (sjr.sj_code < 0) {
			sprintf(log_buffer, "task %8.8X not started, %s %d",
				(unsigned int)ptask->ti_qs.ti_task,
				(sjr.sj_code == JOB_EXEC_RETRY) ?
				"Retry" : "Failure",
				sjr.sj_code);
			log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				LOG_NOTICE, pjob->ji_qs.ji_jobid, log_buffer);
			goto done;
		}

		ptask->ti_qs.ti_sid = sjr.sj_session;
		ptask->ti_qs.ti_status = TI_STATE_RUNNING;
		(void)task_save(ptask);

		/* update the job with the new session id */
		pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long
			= sjr.sj_session;
		pjob->ji_wattr[(int)JOB_ATR_session_id].at_flags =
			ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			pjob->ji_qs.ji_state = JOB_STATE_RUNNING;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			job_save(pjob, SAVEJOB_QUICK);
		}
		rc = 0;
		sprintf(log_buffer, "task %8.8X transmogrified",
			(unsigned int)ptask->ti_qs.ti_task);
		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
			pjob->ji_qs.ji_jobid, log_buffer);

		/* send updated session id to server */
		update_ajob_status(pjob);
	}
	else { /* wait for script */
		DBPRT(("action: setting alarm %d\n", ma->ma_timeout))
		/* NOTE(review): presumably SIGALRM interrupts waitpid() to enforce the timeout - confirm */
		(void)alarm(ma->ma_timeout);
		rc = 0;
		if (waitpid(child, &rc, 0) == -1) {
			sprintf(log_buffer,
				"%s script %s: wait failed %d",
				ma->ma_name, ma->ma_script, errno);
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid,
				log_buffer);
			(void)kill(child, SIGKILL);
			(void)waitpid(child, &rc, 0);
		}
		(void)alarm(0);

		/* collapse the wait status into 0 (success) or -1 (failure) */
		if (WIFEXITED(rc)) {
			rc = WEXITSTATUS(rc);
			sprintf(log_buffer,
				"%s script %s: exit code %d",
				ma->ma_name, ma->ma_script, rc);
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid,
				log_buffer);
			if (rc != 0)
				rc = -1;
		} else if (WIFSIGNALED(rc)) {
			rc = WTERMSIG(rc);
			sprintf(log_buffer,
				"%s script %s: got signal %d",
				ma->ma_name, ma->ma_script, rc);
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid,
				log_buffer);
			rc = -1;
		} else {
			sprintf(log_buffer,
				"%s script %s: exited abnormally",
				ma->ma_name, ma->ma_script);
			log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
				LOG_INFO, pjob->ji_qs.ji_jobid,
				log_buffer);
			rc = -1;
		}
	}
#endif /* WIN32 */

done:
	/* free args arrays; calloc() left the vector NULL terminated */
	for (pargs = args; *pargs; pargs++)
		(void)free(*pargs);
	(void)free(args);
	return rc;
}
/**
 * @brief
 *	Set the signal numbers used to suspend and (optionally) resume jobs.
 *
 *	The leading token of the value, as extracted by TOKCPY, is converted
 *	with atoi() and stored in suspend_signal. If any text remains after
 *	the following whitespace, it is converted the same way and stored in
 *	resume_signal.
 *
 * @param[in]	str - config value: "<suspend> [<resume>]"
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	str is NULL or empty
 * @retval	HANDLER_SUCCESS	otherwise
 *
 */
static handler_ret_t
set_suspend_signal(char *str)
{
	char first_tok[80];
	char *rest;

	if (str == 0 || *str == '\0')
		return HANDLER_FAIL;

	rest = TOKCPY(str, first_tok);
	rest = skipwhite(rest);

	suspend_signal = atoi(first_tok);
	if (*rest != '\0')
		resume_signal = atoi(rest);

	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Add static resource or shell escape line from config file.
 *	This is a support routine for read_config().
 *
 *	The first token of the line is the resource name and the remainder is
 *	its value. A value starting with '!' is a shell escape: the script it
 *	names must pass tmp_file_sec() or the entry is rejected. Any other
 *	value has its trailing white space removed. The resulting name/value
 *	pair is pushed onto the front of the global config_list.
 *
 * @param[in] str - line holding the resource name followed by its value
 * @param[in] file - filename (used only in the log message)
 * @param[in] linenum - line number in file (used only in the log message)
 *
 * @return int
 *
 * @retval 1 - In case of error
 * @retval 0 - In case of success
 *
 */
static int
add_static(char *str, char *file, int linenum)
{
	char name[256];
	struct config_list *cp;

	str = TOKCPY(str, name);/* resource name */
	str = skipwhite(str); /* resource value */
	if (*str == '!') { /* shell escape command */
		int err;
		char *filename;

		rmnl(str);
		filename = get_script_name(&str[1]);
		if (filename == NULL)
			return 1;
#ifdef WIN32
		err = tmp_file_sec(filename, 0, 1, WRITES_MASK, 1);
#else
		err = tmp_file_sec(filename, 0, 1, S_IWGRP|S_IWOTH, 1);
#endif
		if (err != 0) {
			snprintf(log_buffer, sizeof(log_buffer),
				"error: %s file has a non-secure file access, errno: %d", filename, err);
			log_event(PBSEVENT_SECURITY, PBS_EVENTCLASS_SERVER, LOG_ERR, __func__, log_buffer);
			free(filename);
			return 1;
		}
		free(filename);
	}
	else { /* get the value */
		size_t len = strlen(str);

		/*
		 * Strip trailing blanks. The first character is never
		 * examined, and an empty value is left alone instead of
		 * indexing before the start of the string.
		 */
		while (len > 1 && isspace((unsigned char)str[len - 1]))
			str[--len] = '\0';
	}

	cp = malloc(sizeof(*cp));
	/* NOTE(review): memcheck() presumably does not return on NULL - confirm */
	memcheck((char *)cp);
	cp->c_link = config_list;
	cp->c.c_name = strdup(name);
	memcheck(cp->c.c_name);
	cp->c.c_u.c_value = strdup(str);
	memcheck(cp->c.c_u.c_value);

	snprintf(log_buffer, sizeof(log_buffer), "%s[%d] add name %s value %s",
		file, linenum, name, str);
	log_event(PBSEVENT_DEBUG, 0, LOG_DEBUG, "add_static", log_buffer);

	config_list = cp;
	return 0;
}
/**
 * @brief
 *	sets ideal load
 *
 *	Stores the value in ideal_load_val, uses it as the default for
 *	max_load_val when that is still negative, and records an
 *	"ideal_load <value>" entry through add_static().
 *
 * @param[in] value - value for ideal load
 *
 * @return handler_ret_t
 * @retval HANDLER_FAIL Failure (negative value, or add_static() failed)
 * @retval HANDLER_SUCCESS Success
 *
 */
static handler_ret_t
setidealload(char *value)
{
	char newstr[50] = "ideal_load ";
	float val;

	val = (float)atof(value);
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG,
		"ideal_load", value);
	if (val < 0.0)
		return HANDLER_FAIL; /* error */
	ideal_load_val = val;
	if (max_load_val < 0.0)
		max_load_val = val; /* set a default */

	/* bound the append so an over-long value cannot overrun newstr */
	(void)strncat(newstr, value, sizeof(newstr) - strlen(newstr) - 1);
	if (add_static(newstr, "config", 0))
		return HANDLER_FAIL;
	nconfig++;
	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Set the maximum load.
 *
 *	Stores the value in max_load_val, uses it as the default for
 *	ideal_load_val when that is still negative, and records a
 *	"max_load <value>" entry through add_static(). If the text after
 *	the first whitespace-delimited token contains "suspend",
 *	idle_on_maxload is turned on.
 *
 * @param[in]	value - value for maximum load, optionally followed by
 *			further text
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	negative value, or add_static() failed
 * @retval	HANDLER_SUCCESS	Success
 *
 */
static handler_ret_t
setmaxload(char *value)
{
	char entry[50] = "max_load ";
	char *tail;
	float load;

	/* step over the first token; tail ends on white space or the NUL */
	for (tail = value; *tail != '\0' && !isspace((int)*tail); tail++)
		;

	load = (float)atof(value);
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_DEBUG,
		"max_load", value);
	if (load < 0.0)
		return HANDLER_FAIL; /* error */

	max_load_val = load;
	if (ideal_load_val < 0.0)
		ideal_load_val = load;

	/* room left in entry after the 9 character prefix and the NUL is 40 */
	(void)strncat(entry, value, sizeof(entry) - strlen(entry) - 1);
	if (add_static(entry, "config", 0))
		return HANDLER_FAIL;
	nconfig++;

	if (*tail != '\0' && strstr(tail, "suspend") != NULL)
		idle_on_maxload = 1;

	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Process the $max_poll_downtime directive in the config file:
 *		$max_poll_downtime 300
 *
 * @param[in]	value - decimal number stored in max_poll_downtime_val
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	the parsed value is zero or negative
 * @retval	HANDLER_SUCCESS	Success
 *
 */
static handler_ret_t
set_max_poll_downtime(char *value)
{
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
		LOG_INFO, "max_poll_downtime", value);

	/* the position where parsing stopped is not needed */
	max_poll_downtime_val = (time_t)strtol(value, NULL, 10);

	return (max_poll_downtime_val > 0) ? HANDLER_SUCCESS : HANDLER_FAIL;
}
/**
 * @brief
 *	Process the $kbd_idle directive in the config file:
 *		$kbd_idle avail [busy [poll]]
 *
 *	Up to three decimal numbers are read into idle_avail, idle_busy and
 *	idle_poll. Only the first is required; each number that is present
 *	must be greater than zero. Once idle_avail has been accepted,
 *	idle_check and cycle_harvester are turned on. On Windows the idle
 *	files are created and the PBS_INTERACTIVE service is checked and
 *	started as well.
 *
 * @param[in]	value - value for kbd idle
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	Failure
 * @retval	HANDLER_SUCCESS	Success
 *
 */
static handler_ret_t
set_kbd_idle(char *value)
{
	char *cursor;
	char *rest;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
		LOG_INFO, "idle_avail", value);

	idle_avail = (time_t)strtol(value, &rest, 10);
	if (idle_avail <= 0)
		return HANDLER_FAIL; /* error */
	idle_check = 1;
	cycle_harvester = 1;

	for (cursor = rest; isspace((int)*cursor); cursor++)
		;
	if (*cursor != '\0') {
		/* optional idle_busy */
		idle_busy = (time_t)strtol(cursor, &rest, 10);
		if (idle_busy <= 0)
			return HANDLER_FAIL; /* error */

		for (cursor = rest; isspace((int)*cursor); cursor++)
			;
		if (*cursor != '\0') {
			/* optional idle_poll */
			idle_poll = (time_t)strtol(cursor, &rest, 10);
			if (idle_poll <= 0)
				return HANDLER_FAIL; /* error */
		}
	}

#ifdef WIN32
	/* create idle_touch and idle_poll_time file in PBS_HOME/spool/
	 * idle_touch file - To get kbd/mouse time information from pbs_idled process
	 * idle_poll_time file - To inform about idle_poll time from MOM config to pbs_idled process
	 */
	if (create_idle_files())
		return HANDLER_FAIL;

	/* the PBS_INTERACTIVE service has to be registered ... */
	if (check_pbs_interactive())
		return HANDLER_FAIL;

	/* ... and then has to start */
	if (start_pbs_interactive())
		return HANDLER_FAIL;
#endif
	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Set the temporary directory.
 *
 *	Quotes are removed from the value and every trailing path separator
 *	is stripped. The result must fit in pbs_tmpdir and, unless built with
 *	DEBUG or NO_SECURITY_CHECK, must pass the platform's directory
 *	security check before it is copied into pbs_tmpdir.
 *
 * @param[in]	value - value for temp directory
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	Failure
 * @retval	HANDLER_SUCCESS	Success
 *
 */
static handler_ret_t
set_tmpdir(char *value)
{
#ifdef WIN32
	const char separator = '\\';
#else
	const char separator = '/';
#endif
	char *dir;
	size_t len;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
		LOG_INFO, "tmpdir", value);

	dir = remove_quotes(value); /* remove quotes if any present */
	if (dir == NULL)
		return HANDLER_FAIL;

	/* Remove trailing separator */
	len = strlen(dir);
	while (len > 0 && dir[len - 1] == separator)
		dir[--len] = '\0';

	if (len > sizeof(pbs_tmpdir)-1) {
		free(dir);
		return HANDLER_FAIL;
	}

#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
#ifdef WIN32
	if (check_directory(dir, "set_tmpdir") == -1) {
		free(dir);
		return HANDLER_FAIL;
	}
#else /* Unix Only */
	if (tmp_file_sec(dir, 1, 1, 0, 1)) {
		free(dir);
		return HANDLER_FAIL; /* error */
	}
#endif /* WIN32 */
#endif /* NO_SECURITY_CHECK */

	strcpy(pbs_tmpdir, dir);
	free(dir);
	return HANDLER_SUCCESS;
}
/**
 * @brief
 *	Set the job directory root.
 *
 *	Quotes are removed from the value. The result must fit in
 *	pbs_jobdir_root and, unless built with DEBUG or NO_SECURITY_CHECK,
 *	must pass the platform's directory security check before it is
 *	copied into pbs_jobdir_root.
 *
 * @param[in]	value - value for job directory
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	Failure
 * @retval	HANDLER_SUCCESS	Success
 *
 */
static handler_ret_t
set_jobdir_root(char *value)
{
	handler_ret_t result = HANDLER_FAIL;
	char *dir;

	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER,
		LOG_INFO, __func__, value);

	dir = remove_quotes(value); /* remove quotes if any present */
	if (dir == NULL)
		return HANDLER_FAIL;

	if (strlen(dir) > sizeof(pbs_jobdir_root)-1)
		goto out;

#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
#ifdef WIN32
	if (check_directory(dir, __func__) == -1)
		goto out;
#else /* UNIX only */
	if (tmp_file_sec(dir, 1, 1, 0, 1))
		goto out;
#endif /* WIN32 */
#endif /* NO_SECURITY_CHECK */

	strcpy(pbs_jobdir_root, dir);
	result = HANDLER_SUCCESS;

out:
	/* the cleaned copy is released on both the success and failure paths */
	free(dir);
	return result;
}
/**
 * @brief
 *	Set a boolean flag from its textual value.
 *
 *	"no", "false" and "off" (any case) and "0" clear the flag; "yes",
 *	"true" and "on" (any case) and "1" set it. Anything else is logged
 *	and the flag is left untouched.
 *
 * @param[in]	id - name logged as the source of the message
 * @param[in]	value - textual value to interpret
 * @param[out]	flag - configuration flag to update
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	value is missing, empty or not recognized
 * @retval	HANDLER_SUCCESS	Success
 *
 */
handler_ret_t
set_boolean(const char *id, char *value, int *flag)
{
	int means_false;
	int means_true;

	if (value == NULL || *value == '\0') {
		sprintf(log_buffer, "No value specified, no action taken.");
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
			id, log_buffer);
		return HANDLER_FAIL; /* error */
	}

	means_false = strcasecmp(value, "no") == 0 ||
		strcasecmp(value, "false") == 0 ||
		strcasecmp(value, "off") == 0 ||
		strcmp(value, "0") == 0;
	if (means_false) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
			id, "false");
		*flag = FALSE;
		return HANDLER_SUCCESS; /* success */
	}

	means_true = strcasecmp(value, "yes") == 0 ||
		strcasecmp(value, "true") == 0 ||
		strcasecmp(value, "on") == 0 ||
		strcmp(value, "1") == 0;
	if (means_true) {
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
			id, "true");
		*flag = TRUE;
		return HANDLER_SUCCESS; /* success */
	}

	sprintf(log_buffer,
		"Illegal value \"%s\", no action taken.", value);
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
		id, log_buffer);
	return HANDLER_FAIL; /* error */
}
/**
 * @brief
 *	Set an int variable from its textual value.
 *
 *	The whole value must parse as an integer (strtol() with base 0) and
 *	the result must be greater than zero; otherwise the variable is left
 *	untouched.
 *
 * @param[in]	id - name logged as the source of the message
 * @param[in]	value - textual value to parse
 * @param[out]	var - variable to update
 *
 * @return	handler_ret_t
 * @retval	HANDLER_FAIL	value is missing, empty, has trailing
 *				characters, or is not positive
 * @retval	HANDLER_SUCCESS	Success
 *
 */
static handler_ret_t
set_int(const char *id, char *value, int *var)
{
	char *rest;
	int parsed;

	if (value == NULL || *value == '\0') {
		sprintf(log_buffer, "No value specified, no action taken.");
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE,
			id, log_buffer);
		return HANDLER_FAIL; /* error */
	}

	parsed = (int)strtol(value, &rest, 0);
	if (parsed <= 0 || *rest != '\0') {
		sprintf(log_buffer, "bad value \"%s\"", value);
		log_event(PBSEVENT_SYSTEM, 0, LOG_ERR, id, log_buffer);
		return HANDLER_FAIL; /* error */
	}

	*var = parsed;
	sprintf(log_buffer, "setting %d", parsed);
	log_event(PBSEVENT_SYSTEM, 0, LOG_DEBUG, id, log_buffer);
	return HANDLER_SUCCESS;
}
/**
* @brief
* set float value
*
* @param[in] id - function name
* @param[in] value - value