Permalink
Browse files

Make memory types in acct_gather uint64_t to handle systems with more than 4TB of memory on them.
  • Loading branch information...
dannyauble committed Mar 3, 2014
1 parent 0500ea4 commit 98988535b646062b34c6aba81ea21db9be7a37fe
View
2 NEWS
@@ -29,6 +29,8 @@ documents those changes that are of interest to users and admins.
work for now).
-- Add SchedulerParameters options of preempt_reorder_count and
preempt_strict_order.
+ -- Make memory types in acct_gather uint64_t to handle systems with more than
+ 4TB of memory on them.
* Changes in Slurm 14.03.0pre6
==============================
View
@@ -471,15 +471,15 @@ enum jobacct_data_type {
JOBACCT_DATA_PIPE, /* data-> file descriptor */
JOBACCT_DATA_RUSAGE, /* data-> rusage set user_cpu_sec,
* user_cpu_usec, sys_cpu_sec, sys_cpu_usec */
- JOBACCT_DATA_MAX_VSIZE, /* data-> uint32_t vsize */
+ JOBACCT_DATA_MAX_VSIZE, /* data-> uint64_t vsize */
JOBACCT_DATA_MAX_VSIZE_ID, /* data-> jobacct_id_t vsize */
- JOBACCT_DATA_TOT_VSIZE, /* data-> uint32_t vsize */
- JOBACCT_DATA_MAX_RSS, /* data-> uint32_t psize */
+ JOBACCT_DATA_TOT_VSIZE, /* data-> uint64_t vsize */
+ JOBACCT_DATA_MAX_RSS, /* data-> uint64_t psize */
JOBACCT_DATA_MAX_RSS_ID, /* data-> jobacct_id_t psize */
- JOBACCT_DATA_TOT_RSS, /* data-> uint32_t psize */
- JOBACCT_DATA_MAX_PAGES, /* data-> uint32_t psize */
+ JOBACCT_DATA_TOT_RSS, /* data-> uint64_t psize */
+ JOBACCT_DATA_MAX_PAGES, /* data-> uint64_t psize */
JOBACCT_DATA_MAX_PAGES_ID, /* data-> jobacct_id_t psize */
- JOBACCT_DATA_TOT_PAGES, /* data-> uint32_t psize */
+ JOBACCT_DATA_TOT_PAGES, /* data-> uint64_t psize */
JOBACCT_DATA_MIN_CPU, /* data-> uint32_t psize */
JOBACCT_DATA_MIN_CPU_ID, /* data-> jobacct_id_t psize */
JOBACCT_DATA_TOT_CPU, /* data-> uint32_t psize */
View
@@ -324,15 +324,15 @@ typedef struct {
uint32_t disk_write_max_nodeid; /* contains node number max was on */
uint32_t disk_write_max_taskid;/* contains task number max was on */
double pages_ave;
- uint32_t pages_max;
+ uint64_t pages_max;
uint32_t pages_max_nodeid; /* contains which node number it was on */
uint32_t pages_max_taskid; /* contains which task number it was on */
double rss_ave;
- uint32_t rss_max;
+ uint64_t rss_max;
uint32_t rss_max_nodeid; /* contains which node number it was on */
uint32_t rss_max_taskid; /* contains which task number it was on */
double vsize_ave;
- uint32_t vsize_max;
+ uint64_t vsize_max;
uint32_t vsize_max_nodeid; /* contains which node number it was on */
uint32_t vsize_max_taskid; /* contains which task number it was on */
} slurmdb_stats_t;
@@ -115,8 +115,8 @@ static bool plugin_polling = true;
static uint32_t jobacct_job_id = 0;
static uint32_t jobacct_step_id = 0;
-static uint32_t jobacct_mem_limit = 0;
-static uint32_t jobacct_vmem_limit = 0;
+static uint64_t jobacct_mem_limit = 0;
+static uint64_t jobacct_vmem_limit = 0;
/* _acct_kill_step() issue RPC to kill a slurm job step */
static void _acct_kill_step(void)
@@ -542,29 +542,32 @@ extern int jobacct_gather_set_mem_limit(uint32_t job_id, uint32_t step_id,
}
extern void jobacct_gather_handle_mem_limit(
- uint32_t total_job_mem, uint32_t total_job_vsize)
+ uint64_t total_job_mem, uint64_t total_job_vsize)
{
if (!plugin_polling)
return;
if (jobacct_mem_limit) {
if (jobacct_step_id == NO_VAL) {
- debug("Job %u memory used:%u limit:%u KB",
+ debug("Job %u memory used:%"PRIu64" limit:%"PRIu64" KB",
jobacct_job_id, total_job_mem, jobacct_mem_limit);
} else {
- debug("Step %u.%u memory used:%u limit:%u KB",
+ debug("Step %u.%u memory used:%"PRIu64" "
+ "limit:%"PRIu64" KB",
jobacct_job_id, jobacct_step_id,
total_job_mem, jobacct_mem_limit);
}
}
if (jobacct_job_id && jobacct_mem_limit &&
(total_job_mem > jobacct_mem_limit)) {
if (jobacct_step_id == NO_VAL) {
- error("Job %u exceeded memory limit (%u > %u), being "
+ error("Job %u exceeded memory limit "
+ "(%"PRIu64" > %"PRIu64"), being "
"killed", jobacct_job_id, total_job_mem,
jobacct_mem_limit);
} else {
- error("Step %u.%u exceeded memory limit (%u > %u), "
+ error("Step %u.%u exceeded memory limit "
+ "(%"PRIu64" > %"PRIu64"), "
"being killed", jobacct_job_id, jobacct_step_id,
total_job_mem, jobacct_mem_limit);
}
@@ -573,11 +576,13 @@ extern void jobacct_gather_handle_mem_limit(
(total_job_vsize > jobacct_vmem_limit)) {
if (jobacct_step_id == NO_VAL) {
error("Job %u exceeded virtual memory limit "
- "(%u > %u), being killed", jobacct_job_id,
+ "(%"PRIu64" > %"PRIu64"), being killed",
+ jobacct_job_id,
total_job_vsize, jobacct_vmem_limit);
} else {
error("Step %u.%u exceeded virtual memory limit "
- "(%u > %u), being killed", jobacct_job_id,
+ "(%"PRIu64" > %"PRIu64"), being killed",
+ jobacct_job_id,
jobacct_step_id, total_job_vsize,
jobacct_vmem_limit);
}
@@ -646,6 +651,7 @@ extern int jobacctinfo_setinfo(jobacctinfo_t *jobacct,
int *fd = (int *)data;
struct rusage *rusage = (struct rusage *)data;
uint32_t *uint32 = (uint32_t *) data;
+ uint64_t *uint64 = (uint64_t *) data;
double *dub = (double *) data;
jobacct_id_t *jobacct_id = (jobacct_id_t *) data;
struct jobacctinfo *send = (struct jobacctinfo *) data;
@@ -750,31 +756,31 @@ extern int jobacctinfo_setinfo(jobacctinfo_t *jobacct,
jobacct->sys_cpu_usec = rusage->ru_stime.tv_usec;
break;
case JOBACCT_DATA_MAX_RSS:
- jobacct->max_rss = *uint32;
+ jobacct->max_rss = *uint64;
break;
case JOBACCT_DATA_MAX_RSS_ID:
jobacct->max_rss_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_RSS:
- jobacct->tot_rss = *uint32;
+ jobacct->tot_rss = *uint64;
break;
case JOBACCT_DATA_MAX_VSIZE:
- jobacct->max_vsize = *uint32;
+ jobacct->max_vsize = *uint64;
break;
case JOBACCT_DATA_MAX_VSIZE_ID:
jobacct->max_vsize_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_VSIZE:
- jobacct->tot_vsize = *uint32;
+ jobacct->tot_vsize = *uint64;
break;
case JOBACCT_DATA_MAX_PAGES:
- jobacct->max_pages = *uint32;
+ jobacct->max_pages = *uint64;
break;
case JOBACCT_DATA_MAX_PAGES_ID:
jobacct->max_pages_id = *jobacct_id;
break;
case JOBACCT_DATA_TOT_PAGES:
- jobacct->tot_pages = *uint32;
+ jobacct->tot_pages = *uint64;
break;
case JOBACCT_DATA_MIN_CPU:
jobacct->min_cpu = *uint32;
@@ -825,6 +831,7 @@ extern int jobacctinfo_getinfo(
int rc = SLURM_SUCCESS;
int *fd = (int *)data;
uint32_t *uint32 = (uint32_t *) data;
+ uint64_t *uint64 = (uint64_t *) data;
double *dub = (double *) data;
jobacct_id_t *jobacct_id = (jobacct_id_t *) data;
struct rusage *rusage = (struct rusage *)data;
@@ -931,31 +938,31 @@ extern int jobacctinfo_getinfo(
rusage->ru_stime.tv_usec = jobacct->sys_cpu_usec;
break;
case JOBACCT_DATA_MAX_RSS:
- *uint32 = jobacct->max_rss;
+ *uint64 = jobacct->max_rss;
break;
case JOBACCT_DATA_MAX_RSS_ID:
*jobacct_id = jobacct->max_rss_id;
break;
case JOBACCT_DATA_TOT_RSS:
- *uint32 = jobacct->tot_rss;
+ *uint64 = jobacct->tot_rss;
break;
case JOBACCT_DATA_MAX_VSIZE:
- *uint32 = jobacct->max_vsize;
+ *uint64 = jobacct->max_vsize;
break;
case JOBACCT_DATA_MAX_VSIZE_ID:
*jobacct_id = jobacct->max_vsize_id;
break;
case JOBACCT_DATA_TOT_VSIZE:
- *uint32 = jobacct->tot_vsize;
+ *uint64 = jobacct->tot_vsize;
break;
case JOBACCT_DATA_MAX_PAGES:
- *uint32 = jobacct->max_pages;
+ *uint64 = jobacct->max_pages;
break;
case JOBACCT_DATA_MAX_PAGES_ID:
*jobacct_id = jobacct->max_pages_id;
break;
case JOBACCT_DATA_TOT_PAGES:
- *uint32 = jobacct->tot_pages;
+ *uint64 = jobacct->tot_pages;
break;
case JOBACCT_DATA_MIN_CPU:
*uint32 = jobacct->min_cpu;
@@ -1027,7 +1034,9 @@ extern void jobacctinfo_pack(jobacctinfo_t *jobacct,
}
pack8((uint8_t) 1, buffer);
if (!jobacct) {
- for (i = 0; i < 14; i++)
+ for (i = 0; i < 6; i++)
+ pack64(0, buffer);
+ for (i = 0; i < 8; i++)
pack32((uint32_t) 0, buffer);
for (i = 0; i < 4; i++)
packdouble((double) 0, buffer);
@@ -1040,12 +1049,12 @@ extern void jobacctinfo_pack(jobacctinfo_t *jobacct,
pack32((uint32_t)jobacct->user_cpu_usec, buffer);
pack32((uint32_t)jobacct->sys_cpu_sec, buffer);
pack32((uint32_t)jobacct->sys_cpu_usec, buffer);
- pack32((uint32_t)jobacct->max_vsize, buffer);
- pack32((uint32_t)jobacct->tot_vsize, buffer);
- pack32((uint32_t)jobacct->max_rss, buffer);
- pack32((uint32_t)jobacct->tot_rss, buffer);
- pack32((uint32_t)jobacct->max_pages, buffer);
- pack32((uint32_t)jobacct->tot_pages, buffer);
+ pack64(jobacct->max_vsize, buffer);
+ pack64(jobacct->tot_vsize, buffer);
+ pack64(jobacct->max_rss, buffer);
+ pack64(jobacct->tot_rss, buffer);
+ pack64(jobacct->max_pages, buffer);
+ pack64(jobacct->tot_pages, buffer);
pack32((uint32_t)jobacct->min_cpu, buffer);
pack32((uint32_t)jobacct->tot_cpu, buffer);
pack32((uint32_t)jobacct->act_cpufreq, buffer);
@@ -1179,12 +1188,12 @@ extern int jobacctinfo_unpack(jobacctinfo_t **jobacct,
(*jobacct)->sys_cpu_sec = uint32_tmp;
safe_unpack32(&uint32_tmp, buffer);
(*jobacct)->sys_cpu_usec = uint32_tmp;
- safe_unpack32(&(*jobacct)->max_vsize, buffer);
- safe_unpack32(&(*jobacct)->tot_vsize, buffer);
- safe_unpack32(&(*jobacct)->max_rss, buffer);
- safe_unpack32(&(*jobacct)->tot_rss, buffer);
- safe_unpack32(&(*jobacct)->max_pages, buffer);
- safe_unpack32(&(*jobacct)->tot_pages, buffer);
+ safe_unpack64(&(*jobacct)->max_vsize, buffer);
+ safe_unpack64(&(*jobacct)->tot_vsize, buffer);
+ safe_unpack64(&(*jobacct)->max_rss, buffer);
+ safe_unpack64(&(*jobacct)->tot_rss, buffer);
+ safe_unpack64(&(*jobacct)->max_pages, buffer);
+ safe_unpack64(&(*jobacct)->tot_pages, buffer);
safe_unpack32(&(*jobacct)->min_cpu, buffer);
safe_unpack32(&(*jobacct)->tot_cpu, buffer);
safe_unpack32(&(*jobacct)->act_cpufreq, buffer);
@@ -1226,12 +1235,12 @@ extern int jobacctinfo_unpack(jobacctinfo_t **jobacct,
(*jobacct)->sys_cpu_sec = uint32_tmp;
safe_unpack32(&uint32_tmp, buffer);
(*jobacct)->sys_cpu_usec = uint32_tmp;
- safe_unpack32(&(*jobacct)->max_vsize, buffer);
- safe_unpack32(&(*jobacct)->tot_vsize, buffer);
- safe_unpack32(&(*jobacct)->max_rss, buffer);
- safe_unpack32(&(*jobacct)->tot_rss, buffer);
- safe_unpack32(&(*jobacct)->max_pages, buffer);
- safe_unpack32(&(*jobacct)->tot_pages, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->max_vsize, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->tot_vsize, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->max_rss, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->tot_rss, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->max_pages, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->tot_pages, buffer);
safe_unpack32(&(*jobacct)->min_cpu, buffer);
safe_unpack32(&(*jobacct)->tot_cpu, buffer);
safe_unpack32(&(*jobacct)->act_cpufreq, buffer);
@@ -1273,12 +1282,12 @@ extern int jobacctinfo_unpack(jobacctinfo_t **jobacct,
(*jobacct)->sys_cpu_sec = uint32_tmp;
safe_unpack32(&uint32_tmp, buffer);
(*jobacct)->sys_cpu_usec = uint32_tmp;
- safe_unpack32(&(*jobacct)->max_vsize, buffer);
- safe_unpack32(&(*jobacct)->tot_vsize, buffer);
- safe_unpack32(&(*jobacct)->max_rss, buffer);
- safe_unpack32(&(*jobacct)->tot_rss, buffer);
- safe_unpack32(&(*jobacct)->max_pages, buffer);
- safe_unpack32(&(*jobacct)->tot_pages, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->max_vsize, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->tot_vsize, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->max_rss, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->tot_rss, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->max_pages, buffer);
+ safe_unpack32((uint32_t *)&(*jobacct)->tot_pages, buffer);
safe_unpack32(&(*jobacct)->min_cpu, buffer);
safe_unpack32(&(*jobacct)->tot_cpu, buffer);
safe_unpack32(&(*jobacct)->act_cpufreq, buffer);
@@ -101,17 +101,17 @@ struct jobacctinfo {
uint32_t sys_cpu_usec;
uint32_t user_cpu_sec;
uint32_t user_cpu_usec;
- uint32_t max_vsize; /* max size of virtual memory */
+ uint64_t max_vsize; /* max size of virtual memory */
jobacct_id_t max_vsize_id; /* contains which task number it was on */
- uint32_t tot_vsize; /* total virtual memory
+ uint64_t tot_vsize; /* total virtual memory
(used to figure out ave later) */
- uint32_t max_rss; /* max Resident Set Size */
+ uint64_t max_rss; /* max Resident Set Size */
jobacct_id_t max_rss_id; /* contains which task it was on */
- uint32_t tot_rss; /* total rss
+ uint64_t tot_rss; /* total rss
(used to figure out ave later) */
- uint32_t max_pages; /* max pages */
+ uint64_t max_pages; /* max pages */
jobacct_id_t max_pages_id; /* contains which task it was on */
- uint32_t tot_pages; /* total pages
+ uint64_t tot_pages; /* total pages
(used to figure out ave later) */
uint32_t min_cpu; /* min cpu time */
jobacct_id_t min_cpu_id; /* contains which task it was on */
@@ -155,7 +155,7 @@ extern int jobacct_gather_set_proctrack_container_id(uint64_t id);
extern int jobacct_gather_set_mem_limit(uint32_t job_id, uint32_t step_id,
uint32_t mem_limit);
extern void jobacct_gather_handle_mem_limit(
- uint32_t total_job_mem, uint32_t total_job_vsize);
+ uint64_t total_job_mem, uint64_t total_job_vsize);
extern jobacctinfo_t *jobacctinfo_create(jobacct_id_t *jobacct_id);
extern void jobacctinfo_destroy(void *object);
Oops, something went wrong.

0 comments on commit 9898853

Please sign in to comment.