Skip to content

Commit

Permalink
Performance improvements: use array of primary data-types instead of …
Browse files Browse the repository at this point in the history
…array of centroids ( struct )
  • Loading branch information
filipecosta90 committed Feb 21, 2021
1 parent ca8741c commit 7649bd9
Show file tree
Hide file tree
Showing 8 changed files with 653 additions and 230 deletions.
29 changes: 22 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,26 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
# Profiling build: drop -DNDEBUG and keep frame pointers so perf/gdb get
# usable stacks, while keeping -O3 so the profiled code matches release
# codegen. Also emits per-compiler loop-vectorization reports.
if(ENABLE_PROFILE)
    message(STATUS "Enabling profile flags.")
    # Quote the expansions: string(REPLACE) fails with "requires at least
    # four arguments" when the input variable expands to nothing.
    string(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
    string(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -g -ggdb -fno-omit-frame-pointer")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -ggdb -fno-omit-frame-pointer")

    # Vectorization report flags, per compiler family.
    if(CMAKE_C_COMPILER_ID MATCHES "Clang")
        # MATCHES intentionally covers both "Clang" and "AppleClang".
        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Rpass-analysis=loop-vectorize -Rpass=loop-vectorize -Rpass-missed=loop-vectorize")
    elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU")
        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ftree-vectorize -fopt-info-vec-all")
    elseif(CMAKE_C_COMPILER_ID STREQUAL "Intel")
        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -qopt-report=5 -qopt-report-phase=vec")
    elseif(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
        # TBD: no vectorization-report flag wired up for MSVC yet.
    endif()
endif()

# --- System Libraries ---
Expand All @@ -57,7 +72,7 @@ include(UseCodeCoverage)
include(GNUInstallDirs)

# --- Build directories ---
# add_subdirectory() resolves relative paths against CMAKE_CURRENT_SOURCE_DIR,
# so the plain relative form is sufficient; adding the same directory twice
# (the flattened old/new diff lines) would be a configure-time error.
add_subdirectory(src)

# --- Documentation ---
# TODO
Expand All @@ -66,10 +81,10 @@ add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/src")
# Register the test driver; commands are conventionally lowercase.
enable_testing()

if(BUILD_TESTS)
    # Relative path resolves against CMAKE_CURRENT_SOURCE_DIR.
    add_subdirectory(tests)
endif()

# --- Examples ---
if(BUILD_EXAMPLES)
    # Relative path resolves against CMAKE_CURRENT_SOURCE_DIR.
    add_subdirectory(examples)
endif()
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ clean: distclean
# Remove all generated artifacts under the out-of-tree build directory.
distclean:
rm -rf build/*

# Profiling build: configure with the profile CMake options, build verbosely,
# and capture the compiler's stderr (vectorization reports) to
# profile.compiler_stderr_output.txt for later inspection.
profile: clean
	( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. ; $(MAKE) VERBOSE=1 2> $(basename $@).compiler_stderr_output.txt )

# Benchmark run: rebuild with the profile CMake options, then run the
# Google Benchmark binary with a 10s minimum measuring time per benchmark.
bench: clean
( mkdir -p build; cd build ; cmake $(CMAKE_PROFILE_OPTIONS) .. ; $(MAKE) VERBOSE=1 )
$(SHOW) build/tests/histogram_benchmark --benchmark_min_time=10
Expand Down
73 changes: 13 additions & 60 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ t-Digest over previous digests for this purpose is that the t-Digest
handles data with full floating point resolution. The accuracy of
quantile estimates produced by t-Digests can be orders of magnitude more
accurate than those produced by previous digest algorithms. Methods are
provided to create and update t-Digests and retrieve quantiles from the
accumulated distributions.

See [the original paper by Ted Dunning & Otmar
Expand All @@ -38,7 +38,7 @@ The following functions are implemented:

- `td_add`: Add a value to the t-Digest with the specified count
- `td_create`: Allocate a new histogram
- `td_reset`: Empty out a histogram and re-initialize it
- `td_free`: Frees the memory associated with the t-Digest
- `td_compress`: Re-examines the t-Digest to determine whether some centroids are redundant
- `td_merge`: Merge one t-Digest into another
Expand Down Expand Up @@ -74,65 +74,8 @@ Assuming you've followed the previous build steps, it should be as easy as:
make bench
```

### Real-time ops

#### master branch (~121ns/iter for compression 100)
```
tdigest$ make bench
build/tests/histogram_benchmark --benchmark_min_time=10
2020-02-13 18:59:18
Running build/tests/histogram_benchmark
Run on (8 X 3900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x4)
L1 Instruction 32 KiB (x4)
L2 Unified 256 KiB (x4)
L3 Unified 6144 KiB (x1)
Load Average: 1.73, 1.11, 0.88
------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
------------------------------------------------------------------------------------------------
BM_td_add_uniform_dist/100/10000000 1217132495 ns 1215202532 ns 11 Centroid_Count=76 Total_Compressions=205.048k items_per_second=748.098k/s
BM_td_add_uniform_dist/200/10000000 1338093787 ns 1338023019 ns 11 Centroid_Count=120 Total_Compressions=100.695k items_per_second=679.428k/s
BM_td_add_uniform_dist/300/10000000 1426454139 ns 1426372894 ns 10 Centroid_Count=171 Total_Compressions=60.85k items_per_second=701.079k/s
BM_td_add_uniform_dist/400/10000000 1489910255 ns 1489865942 ns 9 Centroid_Count=218 Total_Compressions=40.927k items_per_second=745.779k/s
BM_td_add_uniform_dist/500/10000000 1541433574 ns 1541231001 ns 9 Centroid_Count=253 Total_Compressions=32.621k items_per_second=720.924k/s
BM_td_add_lognormal_dist/100/10000000 1279235916 ns 1279181475 ns 11 Centroid_Count=75 Total_Compressions=204.968k items_per_second=710.682k/s
BM_td_add_lognormal_dist/200/10000000 1396900530 ns 1396842464 ns 10 Centroid_Count=124 Total_Compressions=91.795k items_per_second=715.9k/s
BM_td_add_lognormal_dist/300/10000000 1432398499 ns 1423146494 ns 10 Centroid_Count=167 Total_Compressions=60.787k items_per_second=702.668k/s
BM_td_add_lognormal_dist/400/10000000 1523148779 ns 1507221068 ns 9 Centroid_Count=207 Total_Compressions=40.828k items_per_second=737.192k/s
BM_td_add_lognormal_dist/500/10000000 1551385694 ns 1551317295 ns 9 Centroid_Count=259 Total_Compressions=32.652k items_per_second=716.237k/s
```

#### [perf.improvements](https://github.com/filipecosta90/tdigest/tree/perf.improvements) branch (~71ns/iter for compression 100)
Makes use of a naive quicksort over arrays of primitive data types instead of an array of centroid structs. Naive because it always starts at the beginning, and the initial centroids are already sorted.
```
2020-02-14 14:51:03
Running build/tests/histogram_benchmark
Run on (8 X 3900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x4)
L1 Instruction 32 KiB (x4)
L2 Unified 256 KiB (x4)
L3 Unified 6144 KiB (x1)
Load Average: 1.50, 1.02, 0.96
------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
------------------------------------------------------------------------------------------------
BM_td_add_uniform_dist/100/10000000 718939047 ns 718903714 ns 20 Centroid_Count=69 Total_Compressions=370.58k items_per_second=695.503k/s
BM_td_add_uniform_dist/200/10000000 804681013 ns 804650949 ns 17 Centroid_Count=114 Total_Compressions=155.468k items_per_second=731.044k/s
BM_td_add_uniform_dist/300/10000000 891044330 ns 891005893 ns 16 Centroid_Count=163 Total_Compressions=97.161k items_per_second=701.454k/s
BM_td_add_uniform_dist/400/10000000 1033516962 ns 1033465854 ns 14 Centroid_Count=203 Total_Compressions=63.638k items_per_second=691.156k/s
BM_td_add_uniform_dist/500/10000000 1095685997 ns 1095635532 ns 13 Centroid_Count=251 Total_Compressions=47.155k items_per_second=702.086k/s
BM_td_add_lognormal_dist/100/10000000 736637497 ns 735809155 ns 18 Centroid_Count=66 Total_Compressions=332.592k items_per_second=755.027k/s
BM_td_add_lognormal_dist/200/10000000 807975314 ns 807936447 ns 17 Centroid_Count=114 Total_Compressions=155.149k items_per_second=728.071k/s
BM_td_add_lognormal_dist/300/10000000 900674982 ns 900632289 ns 16 Centroid_Count=160 Total_Compressions=97.167k items_per_second=693.957k/s
BM_td_add_lognormal_dist/400/10000000 964961965 ns 964920578 ns 15 Centroid_Count=207 Total_Compressions=68.071k items_per_second=690.903k/s
BM_td_add_lognormal_dist/500/10000000 1078639324 ns 1078597549 ns 14 Centroid_Count=249 Total_Compressions=50.766k items_per_second=662.236k/s
```

#### [perf.qsort.central](https://github.com/filipecosta90/tdigest/tree/perf.qsort.central) branch (~60ns/iter for compression 100)
Makes use of quicksort with a central pivot.
```
build/tests/histogram_benchmark --benchmark_min_time=10
2020-02-14 15:01:48
Expand All @@ -157,6 +100,16 @@ BM_td_add_lognormal_dist/200/10000000 659716761 ns 659695213 ns 21
BM_td_add_lognormal_dist/300/10000000 686360957 ns 686338645 ns 20 Centroid_Count=158 Total_Compressions=121.153k items_per_second=728.503k/s
BM_td_add_lognormal_dist/400/10000000 710243244 ns 710176968 ns 20 Centroid_Count=202 Total_Compressions=90.684k items_per_second=704.05k/s
BM_td_add_lognormal_dist/500/10000000 727330010 ns 727310898 ns 19 Centroid_Count=247 Total_Compressions=68.802k items_per_second=723.646k/s
BM_td_quantile_lognormal_dist/100/10000000 700255552 ns 700148359 ns 22 items_per_second=649.213k/s
BM_td_quantile_lognormal_dist/200/10000000 1061306502 ns 1060591740 ns 10 items_per_second=942.87k/s
BM_td_quantile_lognormal_dist/300/10000000 1324175646 ns 1323378599 ns 10 items_per_second=755.642k/s
BM_td_quantile_lognormal_dist/400/10000000 1561453826 ns 1559886721 ns 9 items_per_second=712.302k/s
BM_td_quantile_lognormal_dist/500/10000000 1926473858 ns 1925626638 ns 7 items_per_second=741.874k/s
BM_td_merge_lognormal_dist/100/100000 221651863 ns 221600446 ns 77 items_per_second=5.86055k/s
BM_td_merge_lognormal_dist/200/100000 375880655 ns 375754298 ns 32 items_per_second=8.3166k/s
BM_td_merge_lognormal_dist/300/100000 512695687 ns 512352015 ns 26 items_per_second=7.50686k/s
BM_td_merge_lognormal_dist/400/100000 725529230 ns 725268818 ns 19 items_per_second=7.25684k/s
BM_td_merge_lognormal_dist/500/100000 1015191078 ns 1014835789 ns 12 items_per_second=8.21151k/s
```
## Code of Conduct

Expand Down
21 changes: 21 additions & 0 deletions src/td_malloc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Adaptive histogram based on something like streaming k-means crossed with Q-digest.
 * The implementation is a direct descendent of MergingDigest
 * https://github.com/tdunning/t-digest/
 *
 * Copyright (c) 2021 Redis Labs, All rights reserved.
 *
 * Allocator selection.
 *
 * This file is used in order to change the t-digest allocator at compile
 * time: redefine the macros below to point at your allocator of choice, and
 * add the include for that allocator if one is needed (nothing extra is
 * needed for the default libc allocator used here).
 *
 * NOTE(review): identifiers beginning with a double underscore are reserved
 * for the implementation by the C standard; consider renaming these macros
 * (e.g. td_malloc) in a follow-up — that would require updating all call
 * sites, so it is out of scope for a comment-only pass.
 */

#ifndef TD_ALLOC_H
#define TD_ALLOC_H
/* Default mapping: plain libc allocation functions. */
#define __td_malloc malloc
#define __td_calloc calloc
#define __td_realloc realloc
#define __td_free free
#endif
Loading

0 comments on commit 7649bd9

Please sign in to comment.