Skip to content

Commit

Permalink
Merge branch '737-deterministic-scan-and-reduce-by-key-cccl-2-4-prepa…
Browse files Browse the repository at this point in the history
…ration' into 'develop_stream'

Resolve "Deterministic scan and reduce by key (CCCL 2.4 preparation)"

Closes #737

See merge request amd/libraries/rocPRIM!691
  • Loading branch information
Snektron authored and Naraenda committed Jul 18, 2024
2 parents 9c7c297 + c6e16ea commit 4a26848
Show file tree
Hide file tree
Showing 29 changed files with 2,210 additions and 527 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Documentation for rocPRIM is available at
* Added `rocprim::uninitialized_array` which provides uninitialized storage in local memory for user-defined types.
* Added large segment support for `rocprim::segmented_reduce`.
* Added a parallel `nth_element` device function similar to `std::nth_element`. It rearranges the elements so that those smaller than the n-th element come before it and those bigger than the n-th element come after it.
* Added deterministic (bitwise reproducible) algorithm variants `rocprim::deterministic_inclusive_scan`, `rocprim::deterministic_exclusive_scan`, `rocprim::deterministic_inclusive_scan_by_key`, `rocprim::deterministic_exclusive_scan_by_key`, and `rocprim::deterministic_reduce_by_key`. These provide run-to-run stable results with non-associative operators such as float operations, at the cost of reduced performance.

### Changes

Expand Down Expand Up @@ -53,10 +54,10 @@ Documentation for rocPRIM is available at
* New `rocprim::batch_copy` function added. Similar to `rocprim::batch_memcpy`, but copies by element, not with memcpy.
* Added more test cases, to better cover supported data types.
* Updated some tests to work with supported data types.
* An optional `decomposer` argument for all member functions of `rocprim::block_radix_sort` and all functions of `device_radix_sort`.
* An optional `decomposer` argument for all member functions of `rocprim::block_radix_sort` and all functions of `device_radix_sort`.
To sort keys of a user-defined type, a decomposer functor should be passed. The decomposer should produce a `rocprim::tuple`
of references to arithmetic types from the key.
* New `rocprim::predicate_iterator` which acts as a proxy for an underlying iterator based on a predicate.
* New `rocprim::predicate_iterator` which acts as a proxy for an underlying iterator based on a predicate.
It iterates over proxies that hold references to the underlying values, but only allow reading and writing if the predicate is `true`.
It can be instantiated with:
* `rocprim::make_predicate_iterator`
Expand Down
3 changes: 3 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,13 @@ add_rocprim_benchmark(benchmark_device_radix_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_onesweep.cpp)
add_rocprim_benchmark(benchmark_device_reduce_by_key.cpp)
add_rocprim_benchmark(benchmark_device_reduce_by_key_deterministic.cpp)
add_rocprim_benchmark(benchmark_device_reduce.cpp)
add_rocprim_benchmark(benchmark_device_run_length_encode.cpp)
add_rocprim_benchmark(benchmark_device_scan.cpp)
add_rocprim_benchmark(benchmark_device_scan_deterministic.cpp)
add_rocprim_benchmark(benchmark_device_scan_by_key.cpp)
add_rocprim_benchmark(benchmark_device_scan_by_key_deterministic.cpp)
add_rocprim_benchmark(benchmark_device_select.cpp)
add_rocprim_benchmark(benchmark_device_segmented_radix_sort_keys.cpp)
add_rocprim_benchmark(benchmark_device_segmented_radix_sort_pairs.cpp)
Expand Down
47 changes: 33 additions & 14 deletions benchmark/benchmark_device_reduce_by_key.parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ inline std::string config_name<rocprim::default_config>()

template<typename KeyType,
typename ValueType,
int MaxSegmentLength,
typename Config = rocprim::default_config>
int MaxSegmentLength,
bool Deterministic = false,
typename Config = rocprim::default_config>
struct device_reduce_by_key_benchmark : public config_autotune_interface
{
std::string name() const override
Expand Down Expand Up @@ -130,17 +131,34 @@ struct device_reduce_by_key_benchmark : public config_autotune_interface
{
const auto dispatch_input = [&](KeyType* d_key_input)
{
HIP_CHECK(rocprim::reduce_by_key<Config>(d_temp_storage,
temp_storage_size_bytes,
d_key_input,
d_value_input,
size,
d_unique_output,
d_aggregates_output,
d_unique_count_output,
reduce_op,
key_compare_op,
stream));
if ROCPRIM_IF_CONSTEXPR(!Deterministic)
{
HIP_CHECK(rocprim::reduce_by_key<Config>(d_temp_storage,
temp_storage_size_bytes,
d_key_input,
d_value_input,
size,
d_unique_output,
d_aggregates_output,
d_unique_count_output,
reduce_op,
key_compare_op,
stream));
}
else
{
HIP_CHECK(rocprim::deterministic_reduce_by_key<Config>(d_temp_storage,
temp_storage_size_bytes,
d_key_input,
d_value_input,
size,
d_unique_output,
d_aggregates_output,
d_unique_count_output,
reduce_op,
key_compare_op,
stream));
}
};

// One tuning iteration runs multiple inputs with different distributions,
Expand Down Expand Up @@ -223,7 +241,8 @@ struct device_reduce_by_key_benchmark_generator
TilesPerBlock>;
// max segment length argument is irrelevant, tuning overrides segment length
storage.emplace_back(
std::make_unique<device_reduce_by_key_benchmark<KeyType, ValueType, 0, config>>());
std::make_unique<
device_reduce_by_key_benchmark<KeyType, ValueType, 0, false, config>>());
}
};

Expand Down
138 changes: 138 additions & 0 deletions benchmark/benchmark_device_reduce_by_key_deterministic.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// MIT License
//
// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "benchmark_device_reduce_by_key.parallel.hpp"
#include "benchmark_utils.hpp"
// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include <benchmark/benchmark.h>

// HIP API
#include <hip/hip_runtime.h>

#include <cstddef>
#include <string>
#include <vector>

// Default value of the "size" command-line option (described there as a
// number of bytes).
// The guard only skips this definition when DEFAULT_BYTES already exists as a
// preprocessor macro (for example one supplied on the compiler command line);
// in that case the macro's value is what main() passes as the default instead.
#ifndef DEFAULT_BYTES
constexpr size_t DEFAULT_BYTES = size_t{2} << 30; // 2 GiB
#endif

// Registers one deterministic reduce-by-key benchmark.
//
// KEY / VALUE        - key and value types of the benchmarked reduction.
// MAX_SEGMENT_LENGTH - third template argument of device_reduce_by_key_benchmark.
//
// The fourth template argument is hard-coded to `true`; in
// device_reduce_by_key_benchmark that is the `Deterministic` flag, which makes
// the benchmark call rocprim::deterministic_reduce_by_key instead of
// rocprim::reduce_by_key.
//
// The expansion refers to `benchmarks`, `size`, `seed` and `stream` by name, so
// variables with exactly those names must be in scope where the macro is used.
// The surrounding braces keep `instance` local to each expansion.
// NOTE(review): REGISTER_BENCHMARK is defined elsewhere; presumably it appends
// the newly registered benchmark to `benchmarks` - confirm against its definition.
#define CREATE_BENCHMARK(KEY, VALUE, MAX_SEGMENT_LENGTH) \
{ \
const device_reduce_by_key_benchmark<KEY, VALUE, MAX_SEGMENT_LENGTH, true> instance; \
REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \
}

// Registers the deterministic benchmark for one KEY/VALUE pair twice: once with
// MAX_SEGMENT_LENGTH = 10 and once with MAX_SEGMENT_LENGTH = 1000.
// The last expansion has no trailing semicolon; the caller supplies it.
#define CREATE_BENCHMARK_TYPE(KEY, VALUE) \
CREATE_BENCHMARK(KEY, VALUE, 10); \
CREATE_BENCHMARK(KEY, VALUE, 1000)

// Registers benchmarks for KEY combined with a subset of the tuned value types:
// int8_t, rocprim::half, int32_t, float and double (no int16_t or int64_t;
// see CREATE_BENCHMARK_TYPE_TUNING for the full list).
#define CREATE_BENCHMARK_TYPES(KEY) \
CREATE_BENCHMARK_TYPE(KEY, int8_t); \
CREATE_BENCHMARK_TYPE(KEY, rocprim::half); \
CREATE_BENCHMARK_TYPE(KEY, int32_t); \
CREATE_BENCHMARK_TYPE(KEY, float); \
CREATE_BENCHMARK_TYPE(KEY, double)

// Registers benchmarks for KEY combined with all of the tuned value types:
// int8_t, int16_t, int32_t, int64_t, rocprim::half, float and double.
#define CREATE_BENCHMARK_TYPE_TUNING(KEY) \
CREATE_BENCHMARK_TYPE(KEY, int8_t); \
CREATE_BENCHMARK_TYPE(KEY, int16_t); \
CREATE_BENCHMARK_TYPE(KEY, int32_t); \
CREATE_BENCHMARK_TYPE(KEY, int64_t); \
CREATE_BENCHMARK_TYPE(KEY, rocprim::half); \
CREATE_BENCHMARK_TYPE(KEY, float); \
CREATE_BENCHMARK_TYPE(KEY, double)

int main(int argc, char* argv[])
{
cli::Parser parser(argc, argv);
parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
parser.set_optional<int>("trials", "trials", -1, "number of iterations");
parser.set_optional<std::string>("name_format",
"name_format",
"human",
"either: json,human,txt");
parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
parser.run_and_exit_if_error();

// Parse argv
benchmark::Initialize(&argc, argv);
const size_t size = parser.get<size_t>("size");
const int trials = parser.get<int>("trials");
bench_naming::set_format(parser.get<std::string>("name_format"));
const std::string seed_type = parser.get<std::string>("seed");
const managed_seed seed(seed_type);

// HIP
hipStream_t stream = 0; // default

// Benchmark info
add_common_benchmark_info();
benchmark::AddCustomContext("size", std::to_string(size));
benchmark::AddCustomContext("seed", seed_type);

// Add benchmarks
std::vector<benchmark::internal::Benchmark*> benchmarks = {};
// tuned types
CREATE_BENCHMARK_TYPES(int8_t);
CREATE_BENCHMARK_TYPES(int16_t);
CREATE_BENCHMARK_TYPE_TUNING(int32_t);
CREATE_BENCHMARK_TYPE_TUNING(int64_t);
CREATE_BENCHMARK_TYPES(rocprim::half);
CREATE_BENCHMARK_TYPES(float);
CREATE_BENCHMARK_TYPES(double);

// custom types
using custom_float2 = custom_type<float, float>;
using custom_double2 = custom_type<double, double>;

CREATE_BENCHMARK_TYPE(int, custom_float2);
CREATE_BENCHMARK_TYPE(int, custom_double2);

CREATE_BENCHMARK_TYPE(long long, custom_float2);
CREATE_BENCHMARK_TYPE(long long, custom_double2);

// Use manual timing
for(auto& b : benchmarks)
{
b->UseManualTime();
b->Unit(benchmark::kMillisecond);
}

// Force number of iterations
if(trials > 0)
{
for(auto& b : benchmarks)
{
b->Iterations(trials);
}
}

// Run benchmarks
benchmark::RunSpecifiedBenchmarks();
return 0;
}
73 changes: 52 additions & 21 deletions benchmark/benchmark_device_scan.parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@ inline std::string config_name<rocprim::default_config>()
return "default_config";
}

template<bool Exclusive = false,
class T = int,
class ScanOp = rocprim::plus<T>,
class Config = rocprim::default_config>
template<bool Exclusive = false,
class T = int,
class ScanOp = rocprim::plus<T>,
bool Deterministic = false,
class Config = rocprim::default_config>
struct device_scan_benchmark : public config_autotune_interface
{
std::string name() const override
Expand All @@ -81,15 +82,30 @@ struct device_scan_benchmark : public config_autotune_interface
const bool debug = false) const ->
typename std::enable_if<excl, hipError_t>::type
{
return rocprim::exclusive_scan<Config>(temporary_storage,
storage_size,
input,
output,
initial_value,
input_size,
scan_op,
stream,
debug);
if ROCPRIM_IF_CONSTEXPR(!Deterministic)
{
return rocprim::exclusive_scan<Config>(temporary_storage,
storage_size,
input,
output,
initial_value,
input_size,
scan_op,
stream,
debug);
}
else
{
return rocprim::deterministic_exclusive_scan<Config>(temporary_storage,
storage_size,
input,
output,
initial_value,
input_size,
scan_op,
stream,
debug);
}
}

template<bool excl = Exclusive>
Expand All @@ -105,14 +121,28 @@ struct device_scan_benchmark : public config_autotune_interface
typename std::enable_if<!excl, hipError_t>::type
{
(void)initial_value;
return rocprim::inclusive_scan<Config>(temporary_storage,
storage_size,
input,
output,
input_size,
scan_op,
stream,
debug);
if ROCPRIM_IF_CONSTEXPR(!Deterministic)
{
return rocprim::inclusive_scan<Config>(temporary_storage,
storage_size,
input,
output,
input_size,
scan_op,
stream,
debug);
}
else
{
return rocprim::deterministic_inclusive_scan<Config>(temporary_storage,
storage_size,
input,
output,
input_size,
scan_op,
stream,
debug);
}
}

void run(benchmark::State& state,
Expand Down Expand Up @@ -227,6 +257,7 @@ struct device_scan_benchmark_generator
false,
T,
rocprim::plus<T>,
false,
rocprim::scan_config<block_size,
ItemsPerThread,
rocprim::block_load_method::block_load_transpose,
Expand Down
Loading

0 comments on commit 4a26848

Please sign in to comment.