From 6495aaef0bc10a4f05ef0445cb2f92194e0e336b Mon Sep 17 00:00:00 2001 From: rbresearch Date: Mon, 7 Jan 2013 22:53:10 -0600 Subject: [PATCH] revisions to run_sum-performance --- _posts/2013-01-06-run_sum-benchmark.md | 146 ++++++++++++------------- src/2013-01-06-run_sum-benchmark.cpp | 112 +++++++++---------- 2 files changed, 118 insertions(+), 140 deletions(-) diff --git a/_posts/2013-01-06-run_sum-benchmark.md b/_posts/2013-01-06-run_sum-benchmark.md index d381c4f..ad2298f 100644 --- a/_posts/2013-01-06-run_sum-benchmark.md +++ b/_posts/2013-01-06-run_sum-benchmark.md @@ -22,13 +22,9 @@ the language. {% highlight r %} run_sum_R <- function(x, n) { - # x : input vector - # n : size of window - # size of input vector sz <- length(x) - # initialize the output vector ov <- vector(mode = "numeric", length = sz) # sum the values from the beginning of the vector to n @@ -41,6 +37,7 @@ run_sum_R <- function(x, n) { # pad the first n-1 values with NA ov[1:(n-1)] <- NA + return(ov) } @@ -48,110 +45,91 @@ suppressMessages(library(TTR)) library(rbenchmark) set.seed(123) -x <- rnorm(100000) - -stopifnot(all.equal(run_sum_R(x, 500), runSum(x, 500))) +x <- rnorm(10000) # benchmark run_sum_R for given values of x and n -benchmark(run_sum_R(x, 500), run_sum_R(x, 2500), - run_sum_R(x, 4500), run_sum_R(x, 6500), +benchmark( run_sum_R(x, 50), run_sum_R(x, 100), + run_sum_R(x, 150), run_sum_R(x, 200), order = NULL)[,1:4] {% endhighlight %}
-                test replications elapsed relative
-1  run_sum_R(x, 500)          100   49.53    1.000
-2 run_sum_R(x, 2500)          100   52.99    1.070
-3 run_sum_R(x, 4500)          100   61.07    1.233
-4 run_sum_R(x, 6500)          100   65.01    1.313
+               test replications elapsed relative
+1  run_sum_R(x, 50)          100   3.364    1.007
+2 run_sum_R(x, 100)          100   3.339    1.000
+3 run_sum_R(x, 150)          100   3.390    1.015
+4 run_sum_R(x, 200)          100   3.590    1.075
 
+For these benchmarks, I will just focus on the performance of the functions +for a fixed `x` and varying the value of `n`. The results of the benchmark +of `run_sum_R` show that the elapsed time is fairly constant for the +given values of `n` (i.e. O(1)). + Now let us consider a running sum function in C++, call it `run_sum_v1`. One approach is to loop through each element of the given vector -calling std::accumulate to compute the running sum over the window. +calling std::accumulate to compute the running sum. {% highlight cpp %} #include -#include // for accumulate -#include using namespace Rcpp; // [[Rcpp::export]] -std::vector run_sum_v1(std::vector x, int n) { - // x : input vector - // n : size of window +NumericVector run_sum_v1(NumericVector x, int n) { - // size of the input vector int sz = x.size(); + + NumericVector res(sz); - // initialize res vector - std::vector res(sz); - - // loop through the vector calling std::accumulate to - // compute the running sum + // loop through the vector calling std::accumulate for(int i = 0; i < (sz-n+1); i++){ res[i+n-1] = std::accumulate(x.begin()+i, x.end()-sz+n+i, 0.0); } // pad the first n-1 elements with NA std::fill(res.begin(), res.end()-sz+n-1, NA_REAL); + return res; } {% endhighlight %} {% highlight r %} -stopifnot(all.equal(run_sum_v1(x, 500), runSum(x, 500))) - # benchmark run_sum_v1 for given values of x and n -benchmark(run_sum_v1(x, 500), run_sum_v1(x, 2500), - run_sum_v1(x, 4500), run_sum_v1(x, 6500), +benchmark( run_sum_v1(x, 50), run_sum_v1(x, 100), + run_sum_v1(x, 150), run_sum_v1(x, 200), order = NULL)[,1:4] {% endhighlight %}
-                 test replications elapsed relative
-1  run_sum_v1(x, 500)          100   7.635    1.000
-2 run_sum_v1(x, 2500)          100  39.827    5.216
-3 run_sum_v1(x, 4500)          100  75.232    9.854
-4 run_sum_v1(x, 6500)          100 147.537   19.324
+                test replications elapsed relative
+1  run_sum_v1(x, 50)          100   0.045    1.000
+2 run_sum_v1(x, 100)          100   0.088    1.956
+3 run_sum_v1(x, 150)          100   0.128    2.844
+4 run_sum_v1(x, 200)          100   0.170    3.778
 
-The benchmark results of `run_sum_v1` are not very impressive. The -time increases fairly linearly as `n` increases. -This is due to having `std::accumulate` inside the for loop. -For a vector of size 100,000 and `n = 5000`, `std::accumulate` is -called 95,001 times. - -An interesting result is that `run_sum_R` is faster than `run_sum_v1` for -`n=4500` and `n=6500` of the benchmark. This example demonstrates that it -is not always the case that C++ code is faster than R code. - -This is obviously not an "apples-to-apples" comparison because a different -algorithm is used, but the point of the example is to demonstrate the -importance of the algorithm regardless of the programming language. +Although the elapsed times of `run_sum_v1` are quite fast, note that the +time increases approximately linearly as `n` increases (i.e. O(N)). This +will become a problem if we use this function with large values of `n`. Now let us write another running sum function in C++ that uses the same algorithm that is used in `run_sum_R`, call it `run_sum_v2`. 
{% highlight cpp %} // [[Rcpp::export]] -std::vector run_sum_v2(std::vector x, int n) { - // x : input vector - // n : size of window +NumericVector run_sum_v2(NumericVector x, int n) { - // size of input vector int sz = x.size(); - // initialize res vector - std::vector res(sz); + NumericVector res(sz); // sum the values from the beginning of the vector to n res[n-1] = std::accumulate(x.begin(), x.end()-sz+n, 0.0); @@ -163,59 +141,71 @@ std::vector run_sum_v2(std::vector x, int n) { // pad the first n-1 elements with NA std::fill(res.begin(), res.end()-sz+n-1, NA_REAL); + return res; } {% endhighlight %} {% highlight r %} -stopifnot(all.equal(run_sum_v2(x, 500), runSum(x, 500))) - # benchmark run_sum_v2 for given values of x and n -benchmark(run_sum_v2(x, 500), run_sum_v2(x, 2500), - run_sum_v2(x, 4500), run_sum_v2(x, 6500), +benchmark( run_sum_v2(x, 50), run_sum_v2(x, 100), + run_sum_v2(x, 150), run_sum_v2(x, 200), order = NULL)[,1:4] {% endhighlight %}
-                 test replications elapsed relative
-1  run_sum_v2(x, 500)          100   0.183    1.052
-2 run_sum_v2(x, 2500)          100   0.182    1.046
-3 run_sum_v2(x, 4500)          100   0.174    1.000
-4 run_sum_v2(x, 6500)          100   0.224    1.287
+                test replications elapsed relative
+1  run_sum_v2(x, 50)          100   0.007        1
+2 run_sum_v2(x, 100)          100   0.007        1
+3 run_sum_v2(x, 150)          100   0.007        1
+4 run_sum_v2(x, 200)          100   0.007        1
 
-The benchmark results of `run_sum_v2` are relatively fast and much more +The benchmark results of `run_sum_v2` are quite fast and much more favorable than both `run_sum_R` and `run_sum_v1`. The elapsed time is -about a tenth of a second and is fairly constant across the given -values of `n`. +approximately constant across the given values of `n` (i.e. O(1)). -Finally, let us benchmark `runSum` from the TTR package. +Finally, let us benchmark all three functions as well as `runSum` from +the TTR package for a point of reference using larger values for the +size of `x` and `n`. {% highlight r %} +set.seed(42) +y <- rnorm(100000) + # benchmark runSum for given values of x and n -benchmark(runSum(x, 500), runSum(x, 2500), - runSum(x, 4500), runSum(x, 6500), - order = NULL)[,1:4] +benchmark( runSum(y, 4500), run_sum_v1(y, 4500), + run_sum_v2(y, 4500), run_sum_R(y, 4500), + order = "relative")[,1:4] {% endhighlight %}
-             test replications elapsed relative
-1  runSum(x, 500)          100   2.080    1.094
-2 runSum(x, 2500)          100   1.976    1.039
-3 runSum(x, 4500)          100   1.902    1.000
-4 runSum(x, 6500)          100   1.970    1.036
+                 test replications elapsed relative
+3 run_sum_v2(y, 4500)          100   0.082     1.00
+1     runSum(y, 4500)          100   0.889    10.84
+4  run_sum_R(y, 4500)          100  33.717   411.18
+2 run_sum_v1(y, 4500)          100  37.538   457.78
 
-The benchmark results of `runSum` are also quite good. The elapsed time is -about a seven tenths of a second and is fairly constant across the -given values of `n`. It should be noted that `runSum` does some extra +An interesting result of benchmarking with these larger values is +that `run_sum_R` is faster than `run_sum_v1` for the given values. +This example demonstrates that it is not always the case that C++ code +is faster than R code. The inefficiency of `run_sum_v1` is due to having +`std::accumulate` inside the for loop. For a vector of size 100,000 and +`n = 5000`, `std::accumulate` is called 95,001 times! + +This is obviously not an "apples-to-apples" comparison because a different +algorithm is used, but the point of the example is to demonstrate the +importance of the algorithm regardless of the programming language. + +It should be noted that `runSum` does some extra work in R such as checking for a valid `n`, non-leading NAs, etc. and should be considered when comparing the benchmark results of `run_sum_v2` to `runSum`. 
diff --git a/src/2013-01-06-run_sum-benchmark.cpp b/src/2013-01-06-run_sum-benchmark.cpp index e666da7..14201d8 100644 --- a/src/2013-01-06-run_sum-benchmark.cpp +++ b/src/2013-01-06-run_sum-benchmark.cpp @@ -20,13 +20,9 @@ /*** R run_sum_R <- function(x, n) { - # x : input vector - # n : size of window - # size of input vector sz <- length(x) - # initialize the output vector ov <- vector(mode = "numeric", length = sz) # sum the values from the beginning of the vector to n @@ -39,6 +35,7 @@ run_sum_R <- function(x, n) { # pad the first n-1 values with NA ov[1:(n-1)] <- NA + return(ov) } @@ -46,92 +43,71 @@ suppressMessages(library(TTR)) library(rbenchmark) set.seed(123) -x <- rnorm(100000) - -stopifnot(all.equal(run_sum_R(x, 500), runSum(x, 500))) +x <- rnorm(10000) # benchmark run_sum_R for given values of x and n -benchmark(run_sum_R(x, 500), run_sum_R(x, 2500), - run_sum_R(x, 4500), run_sum_R(x, 6500), +benchmark( run_sum_R(x, 50), run_sum_R(x, 100), + run_sum_R(x, 150), run_sum_R(x, 200), order = NULL)[,1:4] */ /** + * For these benchmarks, I will just focus on the performance of the functions + * for a fixed `x` and varying the value of `n`. The results of the benchmark + * of `run_sum_R` show that the elapsed time is fairly constant for the + * given values of `n` (i.e. O(1)). + * * Now let us consider a running sum function in C++, call it `run_sum_v1`. * One approach is to loop through each element of the given vector - * calling std::accumulate to compute the running sum over the window. + * calling std::accumulate to compute the running sum. 
*/ #include -#include // for accumulate -#include using namespace Rcpp; // [[Rcpp::export]] -std::vector run_sum_v1(std::vector x, int n) { - // x : input vector - // n : size of window +NumericVector run_sum_v1(NumericVector x, int n) { - // size of the input vector int sz = x.size(); + + NumericVector res(sz); - // initialize res vector - std::vector res(sz); - - // loop through the vector calling std::accumulate to - // compute the running sum + // loop through the vector calling std::accumulate for(int i = 0; i < (sz-n+1); i++){ res[i+n-1] = std::accumulate(x.begin()+i, x.end()-sz+n+i, 0.0); } // pad the first n-1 elements with NA std::fill(res.begin(), res.end()-sz+n-1, NA_REAL); + return res; } /*** R -stopifnot(all.equal(run_sum_v1(x, 500), runSum(x, 500))) - # benchmark run_sum_v1 for given values of x and n -benchmark(run_sum_v1(x, 500), run_sum_v1(x, 2500), - run_sum_v1(x, 4500), run_sum_v1(x, 6500), +benchmark( run_sum_v1(x, 50), run_sum_v1(x, 100), + run_sum_v1(x, 150), run_sum_v1(x, 200), order = NULL)[,1:4] */ /** - * The benchmark results of `run_sum_v1` are not very impressive. The - * time increases fairly linearly as `n` increases. - * This is due to having `std::accumulate` inside the for loop. - * For a vector of size 100,000 and `n = 5000`, `std::accumulate` is - * called 95,001 times. - * - * An interesting result is that `run_sum_R` is faster than `run_sum_v1` for - * `n=4500` and `n=6500` of the benchmark. This example demonstrates that it - * is not always the case that C++ code is faster than R code. - * - * This is obviously not an "apples-to-apples" comparison because a different - * algorithm is used, but the point of the example is to demonstrate the - * importance of the algorithm regardless of the programming language. - */ - -/** + * Although the elapsed times of `run_sum_v1` are quite fast, note that the + * time increases approximately linearly as `n` increases (i.e. O(N)). 
This + * will become a problem if we use this function with large values of `n`. + * * Now let us write another running sum function in C++ that uses * the same algorithm that is used in `run_sum_R`, call it `run_sum_v2`. */ // [[Rcpp::export]] -std::vector run_sum_v2(std::vector x, int n) { - // x : input vector - // n : size of window +NumericVector run_sum_v2(NumericVector x, int n) { - // size of input vector int sz = x.size(); - // initialize res vector - std::vector res(sz); + NumericVector res(sz); // sum the values from the beginning of the vector to n res[n-1] = std::accumulate(x.begin(), x.end()-sz+n, 0.0); @@ -143,40 +119,52 @@ std::vector run_sum_v2(std::vector x, int n) { // pad the first n-1 elements with NA std::fill(res.begin(), res.end()-sz+n-1, NA_REAL); + return res; } /*** R -stopifnot(all.equal(run_sum_v2(x, 500), runSum(x, 500))) - # benchmark run_sum_v2 for given values of x and n -benchmark(run_sum_v2(x, 500), run_sum_v2(x, 2500), - run_sum_v2(x, 4500), run_sum_v2(x, 6500), +benchmark( run_sum_v2(x, 50), run_sum_v2(x, 100), + run_sum_v2(x, 150), run_sum_v2(x, 200), order = NULL)[,1:4] */ /** - * The benchmark results of `run_sum_v2` are relatively fast and much more + * The benchmark results of `run_sum_v2` are quite fast and much more * favorable than both `run_sum_R` and `run_sum_v1`. The elapsed time is - * about a tenth of a second and is fairly constant across the given - * values of `n`. + * approximately constant across the given values of `n` (i.e. O(1)). * - * Finally, let us benchmark `runSum` from the TTR package. + * Finally, let us benchmark all three functions as well as `runSum` from + * the TTR package for a point of reference using larger values for the + * size of `x` and `n`.
*/ /*** R +set.seed(42) +y <- rnorm(100000) + # benchmark runSum for given values of x and n -benchmark(runSum(x, 500), runSum(x, 2500), - runSum(x, 4500), runSum(x, 6500), - order = NULL)[,1:4] +benchmark( runSum(y, 4500), run_sum_v1(y, 4500), + run_sum_v2(y, 4500), run_sum_R(y, 4500), + order = "relative")[,1:4] */ -/** - * The benchmark results of `runSum` are also quite good. The elapsed time is - * about a seven tenths of a second and is fairly constant across the - * given values of `n`. It should be noted that `runSum` does some extra + /** + * An interesting result of benchmarking with these larger values is + * that `run_sum_R` is faster than `run_sum_v1` for the given values. + * This example demonstrates that it is not always the case that C++ code + * is faster than R code. The inefficiency of `run_sum_v1` is due to having + * `std::accumulate` inside the for loop. For a vector of size 100,000 and + * `n = 5000`, `std::accumulate` is called 95,001 times! + * + * This is obviously not an "apples-to-apples" comparison because a different + * algorithm is used, but the point of the example is to demonstrate the + * importance of the algorithm regardless of the programming language. + * + * It should be noted that `runSum` does some extra * work in R such as checking for a valid `n`, non-leading NAs, etc. * and should be considered when comparing the benchmark results of * `run_sum_v2` to `runSum`.