
Revert "sched/nohz: Rewrite and fix load-avg computation -- again"

This reverts commit 7490d0a as it breaks BFS.
1 parent: db14161 · commit: df5a59c680a1fd3cdc9a77894d0c9b8adee9be23 · Oleksandr Natalenko committed Jul 20, 2012
Showing with 75 additions and 213 deletions.
  1. +0 −8 include/linux/sched.h
  2. +72 −203 kernel/sched/core.c
  3. +1 −0 kernel/sched/idle_task.c
  4. +2 −0 kernel/sched/sched.h
  5. +0 −2 kernel/time/tick-sched.c
8 include/linux/sched.h
@@ -2025,14 +2025,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
}
#endif
-#ifdef CONFIG_NO_HZ
-void calc_load_enter_idle(void);
-void calc_load_exit_idle(void);
-#else
-static inline void calc_load_enter_idle(void) { }
-static inline void calc_load_exit_idle(void) { }
-#endif /* CONFIG_NO_HZ */
-
#ifndef CONFIG_CPUMASK_OFFSTACK
static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
275 kernel/sched/core.c
@@ -2162,73 +2162,11 @@ unsigned long this_cpu_load(void)
}
-/*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- * nr_active = 0;
- * for_each_possible_cpu(cpu)
- * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- * - for_each_possible_cpu() is prohibitively expensive on machines with
- * serious number of cpus, therefore we need to take a distributed approach
- * to calculating nr_active.
- *
- * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- * So assuming nr_active := 0 when we start out -- true per definition, we
- * can simply take per-cpu deltas and fold those into a global accumulate
- * to obtain the same result. See calc_load_fold_active().
- *
- * Furthermore, in order to avoid synchronizing all per-cpu delta folding
- * across the machine, we assume 10 ticks is sufficient time for every
- * cpu to have completed this task.
- *
- * This places an upper-bound on the IRQ-off latency of the machine. Then
- * again, being late doesn't loose the delta, just wrecks the sample.
- *
- * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- * this would add another cross-cpu cacheline miss and atomic operation
- * to the wakeup path. Instead we increment on whatever cpu the task ran
- * when it went into uninterruptible state and decrement on whatever cpu
- * did the wakeup. This means that only the sum of nr_uninterruptible over
- * all cpus yields the correct result.
- *
- * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
- */
-
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+EXPORT_SYMBOL(avenrun);
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2245,9 +2183,6 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
-/*
- * a1 = a0 * e + a * (1 - e)
- */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2259,118 +2194,30 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- * - When we go NO_HZ idle during the window, we can negate our sample
- * contribution, causing under-accounting.
- *
- * We avoid this by keeping two idle-delta counters and flipping them
- * when the window starts, thus separating old and new NO_HZ load.
- *
- * The only trick is the slight shift in index flip for read vs write.
- *
- * 0s 5s 10s 15s
- * +10 +10 +10 +10
- * |-|-----------|-|-----------|-|-----------|-|
- * r:0 0 1 1 0 0 1 1 0
- * w:0 1 1 0 0 1 1 0 0
- *
- * This ensures we'll fold the old idle contribution in this window while
- * accumlating the new one.
- *
- * - When we wake up from NO_HZ idle during the window, we push up our
- * contribution, since we effectively move our sample point to a known
- * busy state.
- *
- * This is solved by pushing the window forward, and thus skipping the
- * sample, for this cpu (effectively using the idle-delta for this cpu which
- * was in effect at the time the window opened). This also solves the issue
- * of having to deal with a cpu having been in NOHZ idle for multiple
- * LOAD_FREQ intervals.
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
+static atomic_long_t calc_load_tasks_idle;
-static inline int calc_load_write_idx(void)
+void calc_load_account_idle(struct rq *this_rq)
{
- int idx = calc_load_idx;
-
- /*
- * See calc_global_nohz(), if we observe the new index, we also
- * need to observe the new update time.
- */
- smp_rmb();
-
- /*
- * If the folding window started, make sure we start writing in the
- * next idle-delta.
- */
- if (!time_before(jiffies, calc_load_update))
- idx++;
-
- return idx & 1;
-}
-
-static inline int calc_load_read_idx(void)
-{
- return calc_load_idx & 1;
-}
-
-void calc_load_enter_idle(void)
-{
- struct rq *this_rq = this_rq();
long delta;
- /*
- * We're going into NOHZ mode, if there's any pending delta, fold it
- * into the pending idle delta.
- */
delta = calc_load_fold_active(this_rq);
- if (delta) {
- int idx = calc_load_write_idx();
- atomic_long_add(delta, &calc_load_idle[idx]);
- }
-}
-
-void calc_load_exit_idle(void)
-{
- struct rq *this_rq = this_rq();
-
- /*
- * If we're still before the sample window, we're done.
- */
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- /*
- * We woke inside or after the sample window, this means we're already
- * accounted through the nohz accounting, so skip the entire deal and
- * sync up for the next window.
- */
- this_rq->calc_load_update = calc_load_update;
- if (time_before(jiffies, this_rq->calc_load_update + 10))
- this_rq->calc_load_update += LOAD_FREQ;
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks_idle);
}
static long calc_load_fold_idle(void)
{
- int idx = calc_load_read_idx();
long delta = 0;
- if (atomic_long_read(&calc_load_idle[idx]))
- delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+ /*
+ * Its got a race, we don't care...
+ */
+ if (atomic_long_read(&calc_load_tasks_idle))
+ delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
return delta;
}
@@ -2456,58 +2303,78 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- if (!time_before(jiffies, calc_load_update + 10)) {
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
- calc_load_update += n * LOAD_FREQ;
- }
+ /*
+ * It could be the one fold was all it took, we done!
+ */
+ if (time_before(jiffies, calc_load_update + 10))
+ return;
/*
- * Flip the idle index...
- *
- * Make sure we first write the new time then flip the index, so that
- * calc_load_write_idx() will see the new time when it reads the new
- * index, this avoids a double flip messing things up.
+ * Catch-up, fold however many we are behind still
*/
- smp_wmb();
- calc_load_idx++;
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+}
+#else
+void calc_load_account_idle(struct rq *this_rq)
+{
}
-#else /* !CONFIG_NO_HZ */
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
+static inline long calc_load_fold_idle(void)
+{
+ return 0;
+}
-#endif /* CONFIG_NO_HZ */
+static void calc_global_nohz(void)
+{
+}
+#endif
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
void calc_global_load(unsigned long ticks)
{
- long active, delta;
+ long active;
if (time_before(jiffies, calc_load_update + 10))
return;
- /*
- * Fold the 'old' idle-delta to include all NO_HZ cpus.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2518,7 +2385,12 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+ * Account one period with whatever state we found before
+ * folding in the nohz state and ageing the entire idle period.
+ *
+ * This avoids loosing a sample when we go idle between
+ * calc_load_account_active() (10 ticks ago) and now and thus
+ * under-accounting.
*/
calc_global_nohz();
}
@@ -2535,17 +2407,14 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
+ delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
this_rq->calc_load_update += LOAD_FREQ;
}
/*
- * End of global load-average stuff
- */
-
-/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
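Editorial note: the comment removed by this revert summarizes the load-average update as a1 = a0 * e + a * (1 - e), which is exactly what calc_load() computes in fixed point. Below is a minimal user-space sketch of that update, not kernel code; the FSHIFT/FIXED_1/EXP_* constants follow the kernel's avenrun convention, while the sample nr_active value and the main() driver are made up for illustration.

/*
 * Illustration only: fixed-point exponential decay of the load average,
 * a1 = a0 * e + a * (1 - e), applied once per LOAD_FREQ (~5 s) sample.
 */
#include <stdio.h>

#define FSHIFT   11                  /* bits of fixed-point precision */
#define FIXED_1  (1UL << FSHIFT)     /* 1.0 in fixed point */
#define EXP_1    1884                /* ~1/exp(5s/1min) in fixed point */
#define EXP_5    2014                /* ~1/exp(5s/5min) */
#define EXP_15   2037                /* ~1/exp(5s/15min) */

static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;                         /* a0 * e       */
        load += active * (FIXED_1 - exp);    /* a * (1 - e)  */
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long nr_active = 3;                 /* hypothetical sample */
        unsigned long active = nr_active * FIXED_1;  /* scale to fixed point */

        /* One LOAD_FREQ period of decay toward the sampled activity. */
        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);

        printf("1min %lu.%02lu\n", avenrun[0] >> FSHIFT,
               (avenrun[0] & (FIXED_1 - 1)) * 100 / FIXED_1);
        return 0;
}

Starting from an idle history, a single sample of 3 active tasks pulls the 1-minute figure to roughly 0.24, which matches the decay factor EXP_1/FIXED_1 ≈ 0.92 described by the removed comment.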
1 kernel/sched/idle_task.c
@@ -25,6 +25,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
+ calc_load_account_idle(rq);
return rq->idle;
}
2 kernel/sched/sched.h
@@ -940,6 +940,8 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
+void calc_load_account_idle(struct rq *this_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
2 kernel/time/tick-sched.c
@@ -401,7 +401,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
*/
if (!ts->tick_stopped) {
select_nohz_load_balancer(1);
- calc_load_enter_idle();
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -592,7 +591,6 @@ void tick_nohz_idle_exit(void)
account_idle_ticks(ticks);
#endif
- calc_load_exit_idle();
touch_softlockup_watchdog();
/*
* Cancel the scheduled timer and restore the tick
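Editorial note: both the reverted code and the code restored here rely on per-CPU delta folding via calc_load_fold_active(): each CPU remembers the nr_active value it last reported and only pushes the difference into the global counter, so the sum of deltas reproduces the true total without a for_each_possible_cpu() walk. The sketch below is a plain user-space illustration of that identity; struct fake_rq, NR_CPUS and the sample values are stand-ins invented for this example, not kernel definitions.

/*
 * Illustration only: folding per-CPU nr_active deltas into one global
 * accumulator, as calc_load_fold_active() does for calc_load_tasks.
 */
#include <stdio.h>

#define NR_CPUS 4

struct fake_rq {
        long nr_running;
        long nr_uninterruptible;
        long calc_load_active;       /* value folded at the last sample */
};

static long calc_load_tasks;         /* global accumulator */

static long calc_load_fold_active(struct fake_rq *rq)
{
        long nr_active = rq->nr_running + rq->nr_uninterruptible;
        long delta = nr_active - rq->calc_load_active;

        rq->calc_load_active = nr_active;
        return delta;
}

int main(void)
{
        struct fake_rq rqs[NR_CPUS] = {
                { .nr_running = 2 }, { .nr_running = 1 },
                { .nr_uninterruptible = 1 }, { 0 },
        };

        /* Each CPU folds only its own delta; the sum is the true nr_active. */
        for (int i = 0; i < NR_CPUS; i++)
                calc_load_tasks += calc_load_fold_active(&rqs[i]);

        printf("global nr_active = %ld\n", calc_load_tasks);
        return 0;
}

Because every CPU starts from calc_load_active == 0, the folded deltas here sum to 4, the same value a full walk over all runqueues would produce.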
