Add nth percentile calculation #137

Merged
merged 16 commits into from Oct 12, 2017
20 changes: 2 additions & 18 deletions lib/benchee/statistics.ex
@@ -5,6 +5,7 @@ defmodule Benchee.Statistics do
"""

alias Benchee.Statistics.Mode
alias Benchee.Statistics.Percentile

defstruct [:average, :ips, :std_dev, :std_dev_ratio, :std_dev_ips, :median,
:mode, :minimum, :maximum, :sample_size]
@@ -159,7 +160,7 @@ defmodule Benchee.Statistics do
deviation = standard_deviation(run_times, average, iterations)
standard_dev_ratio = deviation / average
standard_dev_ips = ips * standard_dev_ratio
median = compute_median(run_times, iterations)
median = Percentile.percentile(run_times, iterations, 50)
Collaborator:

Good call re-using this! I never would have put that together myself.

Collaborator (author):

Hahaha, I think you would have. I didn't realize until I kept seeing the same values for median and 50th percentile. And reading!

Member:

It's non-obvious, yeah, but this is how I imagined it. Thanks 😁
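
The reuse discussed above works because a 50th percentile with this interpolation scheme is the median by construction. A standalone sketch of that equivalence (the `MedianSketch` module name is hypothetical, and this mirrors rather than calls Benchee's code; it assumes at least two samples):

```elixir
# Standalone sketch: with rank = 0.5 * (n + 1), the 50th percentile lands
# exactly halfway between the two middle elements of an even-length list
# (and directly on the middle element of an odd-length one) - the median.
defmodule MedianSketch do
  def median_via_percentile(samples) do
    sorted = Enum.sort(samples)
    rank = 0.5 * (length(samples) + 1)  # e.g. n = 4 -> rank = 2.5
    index = trunc(rank)                 # 1-based rank of the lower neighbour
    [lower, upper | _] = Enum.drop(sorted, index - 1)
    lower + (rank - index) * (upper - lower)
  end
end

IO.inspect(MedianSketch.median_via_percentile([4, 1, 3, 2]))  # 2.5
IO.inspect(MedianSketch.median_via_percentile([3, 1, 2]))     # 2.0
```

For the odd-length case the interpolation weight `rank - index` is zero, so the middle element comes back unchanged (as a float).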

mode = Mode.mode(run_times)
minimum = Enum.min run_times
maximum = Enum.max run_times
@@ -189,21 +190,4 @@ defmodule Benchee.Statistics do
variance = total_variance / iterations
:math.sqrt variance
end

defp compute_median(run_times, iterations) do
# this is rather inefficient, as O(log(n) * n + n) - there are
# O(n) algorithms to do compute this should it get to be a problem.
sorted = Enum.sort(run_times)
middle = div(iterations, 2)

if Integer.is_odd(iterations) do
sorted |> Enum.at(middle) |> to_float
else
(Enum.at(sorted, middle) + Enum.at(sorted, middle - 1)) / 2
end
end

defp to_float(maybe_integer) do
:erlang.float maybe_integer
end
end
86 changes: 86 additions & 0 deletions lib/benchee/statistics/percentile.ex
@@ -0,0 +1,86 @@
defmodule Benchee.Statistics.Percentile do
@moduledoc false

@doc """
Calculates the value at the `percentile_number`-th percentile. Think of this as the
value below which `percentile_number` percent of the samples lie. For example,
if `Benchee.Statistics.Percentile.percentile(samples, 99)` == 123.45,
99% of samples are less than 123.45.
Member:

less than or equal to imo?

Collaborator (author):

Definitely open to improvements in the documentation. My understanding is that this value we are calculating is a "less than" number. Here's what Wikipedia says:

A percentile (or a centile) is a measure used in statistics indicating the value below which a given percentage of observations in a group of observations fall. For example, the 20th percentile is the value (or score) below which 20% of the observations may be found.

The term percentile and the related term percentile rank are often used in the reporting of scores from norm-referenced tests. For example, if a score is at the 86th percentile, where 86 is the percentile rank, it is equal to the value below which 86% of the observations may be found (carefully contrast with in the 86th percentile, which means the score is at or below the value of which 86% of the observations may be found - every score is in the 100th percentile).

Member:

imo it's or equal. E.g.

When we have 10 values 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 - the 80th percentile would be 8 and 8 is a member of that 80th percentile group so from my understanding it should be less than or equal

Collaborator (author):

However....

iex(2)> Percentile.percentile([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 80)
8.8

Member:

but.... but... I guess I have to take a look at the calculation again then and the involved values or the algorithm used to obtain them. I had some suspicions about indexes being off somewhere... for my understanding of 80th percentile (aka 80 percent of the data) this should most definitely return 8 - or where am I going wrong?

Member:

After cross-checking, this seems to be the right value (tm), but it still appears weird to me. I mean, 80% of my values are up to and including 8. In this particular case I would have understood if it were like 8.2 or something... 8.8 just plain seems too weird. Need to up my percentile knowledge, apparently.
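
The puzzling 8.8 falls straight out of the type-6 rank arithmetic; a standalone re-derivation in plain Elixir (mirroring, not calling, the module's clauses):

```elixir
# Why percentile(1..10, 80) comes out as 8.8 under type-6 interpolation:
#   rank  = (80 / 100) * (10 + 1) = 8.8
#   index = trunc(8.8) = 8          -> lower = 8, upper = 9
#   value = 8 + 0.8 * (9 - 8) = 8.8
samples = Enum.to_list(1..10)
rank = 80 / 100 * (length(samples) + 1)
index = trunc(rank)
[lower, upper | _] = Enum.drop(Enum.sort(samples), index - 1)
value = lower + (rank - index) * (upper - lower)
IO.inspect(Float.round(value, 4))  # 8.8
```

So the "or equal" intuition (8 as the answer) corresponds to a different interpolation type; type-6 deliberately interpolates 80% of the way from the 8th value toward the 9th.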


## Examples

iex> Benchee.Statistics.Percentile.percentile([5, 3, 4, 5, 1, 3, 1, 3], 8, 100)
Collaborator:

Thank you for the lovely doctests 🎉

Member:

🎉 🎉 🎉

5.0

iex> Benchee.Statistics.Percentile.percentile([5, 3, 4, 5, 1, 3, 1, 3], 8, 150)
5.0

iex> Benchee.Statistics.Percentile.percentile([5, 3, 4, 5, 1, 3, 1, 3], 8, 0)
1.0

iex> Benchee.Statistics.Percentile.percentile([5, 3, 4, 5, 1, 3, 1, 3], 8, -1)
1.0

iex> Benchee.Statistics.Percentile.percentile([5, 3, 4, 5, 1, 3, 1, 3], 50)
3.0

iex> Benchee.Statistics.Percentile.percentile([5, 3, 4, 5, 1, 3, 1, 3], 75)
4.75
"""
@spec percentile(list(number()), integer()) :: float()
def percentile(samples, percentile_number) do
percentile(samples, length(samples), percentile_number)
end

@spec percentile(list(number()), integer(), integer()) :: float()
def percentile(samples, number_of_samples, percentile_number) when percentile_number > 100 do
Collaborator:

First off, do we want percentile/3 to be public? It seems to me like the caller can in all cases use percentile/2 and be totally cool.

What would you think about failing in this case and the one on line 40, and giving a helpful error message? I can't imagine that a user would give us a percentage of more than 99 or less than 1 on purpose. I think it would be likely that these would be cases of typos, and that by coercing these error cases we might be giving users results they think are strange and hiding their typos when we should be telling them how to fix their typos and get the results they want.

Collaborator (author):

@devonestes so....the reason I have percentile/3 is because Statistics has both samples and iterations. I assumed that was to avoid calculating the length of the list over and over again (though I'm not certain about that). So—you're right, percentile/2 is really the only necessary interface, unless we want to be able to pass in number_of_samples (i.e. iterations) for efficiency. Have we.....benchmarked? 😁

Collaborator (author):

What would you think about failing in this case and the one on line 40, and giving a helpful error message?

Actually that was my first inclination, but then I backed it out. What do you imagine, throwing something like raise ArgumentError, "bad percentile value: #{bad_value}. Percentile must be greater than 0 and less than 100"?

Member:

For performance see #139 - with what is reasonable to expect to hit us (~1 million) it takes ~4ms on my machine. We might have multiple scenarios, so let's say * 10; that gives 40ms, which is ok I guess ;) For 10 million we'd be at about 300ms total according to this calculation. Not noticeable for the given scenario, user-wise.
However, that's not the real reason I reused iterations - it just feels bad to me when I have calculated a value in a top-level method and then just recalculate the value in a method directly called from that method. Such waste, could just pass it along.
When I eventually extract a library called statistex from benchee, I think I'd keep the /3 version because I'd like to give that possibility to people, as they might operate on bigger data and it might make a difference for them.

Raising an argument error sounds nice! 👍

Collaborator (author):

so, not sure where we stand with percentile/2 vs percentile/3. Speed-wise, not a big deal (thanks @PragTob). Actually, percentile/2 is not used here at all, I just included it because it felt like the best interface. And I think there is a difference between carrying along the precomputed length within a module, versus allowing it across modules. I mean, if you pass the wrong number there, you get wrong results. So, for that reason, I'm going to remove the percentile/3 version. It's trivial to add back again in the future.
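
The validation floated above might look something like the following sketch - a hypothetical `PercentileSketch` module combining the suggested `ArgumentError` with a `percentile/2`-only interface; one possible shape, not necessarily what this PR merged:

```elixir
# Sketch of the suggested validation: reject out-of-range percentiles
# loudly instead of silently clamping them to 0 or 100.
defmodule PercentileSketch do
  def percentile(_samples, percentile_number)
      when percentile_number <= 0 or percentile_number >= 100 do
    raise ArgumentError,
          "bad percentile value: #{percentile_number}. " <>
            "Percentile must be greater than 0 and less than 100"
  end

  def percentile(samples, percentile_number) do
    sorted = Enum.sort(samples)
    rank = percentile_number / 100 * (length(samples) + 1)
    index = trunc(rank)

    cond do
      index == 0 ->
        :erlang.float(hd(sorted))

      index >= length(sorted) ->
        :erlang.float(List.last(sorted))

      true ->
        [lower, upper | _] = Enum.drop(sorted, index - 1)
        lower + (rank - index) * (upper - lower)
    end
  end
end

IO.inspect(PercentileSketch.percentile([5, 3, 4, 5, 1, 3, 1, 3], 50))  # 3.0
IO.inspect(PercentileSketch.percentile([5, 3, 4, 5, 1, 3, 1, 3], 75))  # 4.75
```

With the guard in place, the `> 100` and `< 0` clamping clauses (and the `100`/`150`/`0`/`-1` doctests above) would become compile-time-unreachable and could be dropped.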

percentile(samples, number_of_samples, 100)
end

def percentile(samples, number_of_samples, percentile_number) when percentile_number < 0 do
percentile(samples, number_of_samples, 0)
end

def percentile(samples, number_of_samples, percentile_number) do
sorted = Enum.sort(samples)
rank = (percentile_number / 100) * max(0, number_of_samples + 1)
Member:

why number_of_samples + 1 - i.e. why the + 1, shouldn't we be good without it?

Collaborator (author):

@PragTob I had the same thought. But it's part of the formula. everything comes out wrong if you just use number_of_samples. Imagine you have a list of 10 items, and you're looking for the 50th percentile (median):

currently: rank = (50 / 100) * 11 = 5.5
without + 1: rank = (50 / 100) * 10 = 5

So, as-is, we look for the value halfway between the 5th and 6th elements (yay, median). If we remove the +1, then we look for the value of the 5th element (boo, not the median).

Member:

makes sense... for the odd case of 9, 1/2 * 9 would also be 4.5 but we are looking for 5. Thanks.

However.... dum dum dum - we are 0 indexed. So in my example above we should be looking at number 4 not number 5? But maybe that is redeemed through the drop... weird.

I think it's ok as the median tests I wrote back in the day + your tests are passing. One of these things where I trust the tests more than my brain :D
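
The `+ 1` question can be checked numerically; a standalone sketch of the rank arithmetic for both even and odd sample counts (plain Elixir, independent of the module):

```elixir
# Type-6 rank for the p-th percentile over n samples: rank = p/100 * (n + 1).
rank = fn n, p -> p / 100 * (n + 1) end

IO.inspect(rank.(10, 50))  # 5.5 -> halfway between the 5th and 6th values
IO.inspect(rank.(9, 50))   # 5.0 -> exactly the 5th (middle) value

# The 0-indexing worry is absorbed by Enum.drop/2: the 1-based rank is
# converted to a 0-based drop count via `index - 1`.
sorted = Enum.sort([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
index = trunc(rank.(10, 50))                        # 5
[lower, upper | _] = Enum.drop(sorted, index - 1)   # drops 4 -> [5, 6, ...]
IO.inspect(lower + (rank.(10, 50) - index) * (upper - lower))  # 5.5
```

Without the `+ 1`, the even case would give rank 5 and pick the 5th value instead of the median, matching the author's point above.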

percentile_value(sorted, rank)
end

defp percentile_value(sorted, rank) when trunc(rank) == 0 do
Collaborator:

I had never seen trunc before - very cool!

Member:

With our new error throwing ways we shouldn't need this method anymore :)

Member:

Same thing for the function version below but it won't let me comment :)

Collaborator (author):

this probably got buried in my initial comment, but I added this back as a private function mainly so we don't have to sort the list again while calculating multiple percentiles on the same list

Member:

I meant the specific == 0, but maybe we can't get rid of it :'(

Collaborator (author):

I'll try—it's actually a guard for the Enum.drop later, to make sure we have enough elements to make a match. But I'm not sure it adequately protects us. I need to write some more tests.

sorted
|> hd
|> to_float
end

defp percentile_value(sorted, rank) when trunc(rank) >= length(sorted) do
sorted
|> Enum.reverse
|> hd
|> to_float
end

defp percentile_value(sorted, rank) do
index = trunc(rank)
[lower_bound, upper_bound | _] = Enum.drop(sorted, index - 1)
Member:

at first I was pretty skeptical, but this seems like a great usage of drop - thanks! 👍

interpolation_value = interpolation_value(lower_bound, upper_bound, rank)
lower_bound + interpolation_value
end

# "Type 6" interpolation strategy. There are many ways to interpolate a value
# when the rank is not an integer (in other words, we don't exactly land on a
# particular sample). Of the 9 main strategies, (types 1-9), types 6, 7, and 8
# are generally acceptable and give similar results.
#
# For more information on interpolation strategies, see:
# - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
# - http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
Member:

👍 information

defp interpolation_value(lower_bound, upper_bound, rank) do
interpolation_weight = rank - trunc(rank)
interpolation_weight * (upper_bound - lower_bound)
Member:

😍

end

defp to_float(maybe_integer) do
:erlang.float maybe_integer
end
end
26 changes: 26 additions & 0 deletions test/benchee/statistics/percentile_test.exs
@@ -0,0 +1,26 @@
defmodule Benchee.Statistics.PercentileTest do
use ExUnit.Case, async: true
alias Benchee.Statistics.Percentile
doctest Percentile

@nist_sample_data [
95.1772,
95.1567,
95.1937,
95.1959,
95.1442,
95.0610,
95.1591,
95.1195,
95.1065,
95.0925,
95.1990,
95.1682
]

@tag run: true
Member:

we don't need the tag, do we? :)

Collaborator (author):

😊

test "90th percentile" do
result = Percentile.percentile(@nist_sample_data, 90)
assert Float.round(result, 4) == 95.1981
end
end
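
The expected value in the test above can be reproduced by hand with the type-6 formula the module documents (standalone Elixir, independent of Benchee):

```elixir
# Hand-check of the NIST 90th-percentile test value:
#   n = 12, rank = 0.9 * 13 = 11.7, so interpolate 70% of the way
#   between the 11th and 12th sorted values.
nist = [95.1772, 95.1567, 95.1937, 95.1959, 95.1442, 95.0610,
        95.1591, 95.1195, 95.1065, 95.0925, 95.1990, 95.1682]
sorted = Enum.sort(nist)
rank = 90 / 100 * (length(nist) + 1)               # 11.7
index = trunc(rank)                                # 11
[lower, upper | _] = Enum.drop(sorted, index - 1)  # [95.1959, 95.1990]
value = lower + (rank - index) * (upper - lower)
IO.inspect(Float.round(value, 4))                  # 95.1981
```

That is 95.1959 + 0.7 * 0.0031 = 95.19807, which rounds to the 95.1981 the test asserts.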