Skip to content

Commit

Permalink
Merge branch 'improve-transform-performance' into 'develop_stream'
Browse files Browse the repository at this point in the history
tune device transform

Closes #698

See merge request amd/libraries/rocPRIM!621
  • Loading branch information
Naraenda committed Jul 18, 2024
2 parents b06da41 + 3124c77 commit de58f35
Show file tree
Hide file tree
Showing 10 changed files with 747 additions and 120 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Documentation for rocPRIM is available at

* Improved the performance of `warp_sort_shuffle` and `block_sort_bitonic`.
* Created an optimized version of the `warp_exchange` functions `blocked_to_striped_shuffle` and `striped_to_blocked_shuffle` when the warpsize is equal to the items per thread.
* Improved the performance of `device_transform`.

### Fixes

Expand Down
8 changes: 7 additions & 1 deletion benchmark/ConfigAutotuneSettings.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -93,5 +93,11 @@ KeyType;ValueType;BlockSize;ItemsPerThread;PartitionAllowed" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};int8_t;64;4 8 16;true false" PARENT_SCOPE)
set(output_pattern_suffix "\
@KeyType@_@ValueType@_@BlockSize@_@ItemsPerThread@_@PartitionAllowed@" PARENT_SCOPE)
elseif(file STREQUAL "benchmark_device_transform")
set(list_across_names "\
DataType;BlockSize;" PARENT_SCOPE)
set(list_across "${TUNING_TYPES};64 128 256 512 1024" PARENT_SCOPE)
set(output_pattern_suffix "\
@DataType@_@BlockSize@" PARENT_SCOPE)
endif()
endfunction()
151 changes: 39 additions & 112 deletions benchmark/benchmark_device_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "benchmark_device_transform.parallel.hpp"
#include "benchmark_utils.hpp"

// CmdParser
#include "cmdparser.hpp"

Expand All @@ -45,104 +47,11 @@
const size_t DEFAULT_N = 1024 * 1024 * 128;
#endif

const unsigned int batch_size = 10;
const unsigned int warmup_size = 5;

template<class T>
struct transform
{
__device__ __host__
constexpr T operator()(const T& a) const
{
return a + T(5);
}
};

template<
class T,
class BinaryFunction
>
void run_benchmark(benchmark::State& state,
size_t size,
const hipStream_t stream,
BinaryFunction transform_op)
{
std::vector<T> input = get_random_data<T>(size, T(0), T(1000));

T * d_input;
T * d_output;
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T)));
HIP_CHECK(
hipMemcpy(
d_input, input.data(),
size * sizeof(T),
hipMemcpyHostToDevice
)
);
HIP_CHECK(hipDeviceSynchronize());

// Warm-up
for(size_t i = 0; i < warmup_size; i++)
{
HIP_CHECK(
rocprim::transform(
d_input, d_output, size,
transform_op, stream
)
);
#define CREATE_BENCHMARK(T) \
{ \
const device_transform_benchmark<T> instance{}; \
REGISTER_BENCHMARK(benchmarks, size, stream, instance); \
}
HIP_CHECK(hipDeviceSynchronize());

// HIP events creation
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));

for(auto _ : state)
{
// Record start event
HIP_CHECK(hipEventRecord(start, stream));

for(size_t i = 0; i < batch_size; i++)
{
HIP_CHECK(
rocprim::transform(
d_input, d_output, size,
transform_op, stream
)
);
}

// Record stop event and wait until it completes
HIP_CHECK(hipEventRecord(stop, stream));
HIP_CHECK(hipEventSynchronize(stop));

float elapsed_mseconds;
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
state.SetIterationTime(elapsed_mseconds / 1000);
}

// Destroy HIP events
HIP_CHECK(hipEventDestroy(start));
HIP_CHECK(hipEventDestroy(stop));

state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
state.SetItemsProcessed(state.iterations() * batch_size * size);

HIP_CHECK(hipFree(d_input));
HIP_CHECK(hipFree(d_output));
}

#define CREATE_BENCHMARK(T, TRANSFORM_OP) \
benchmark::RegisterBenchmark( \
bench_naming::format_name("{lvl:device,algo:transform,key_type:" #T \
",transform_op:" #TRANSFORM_OP ",cfg:default_config}") \
.c_str(), \
run_benchmark<T, TRANSFORM_OP>, \
size, \
stream, \
TRANSFORM_OP())

int main(int argc, char *argv[])
{
Expand All @@ -153,6 +62,17 @@ int main(int argc, char *argv[])
"name_format",
"human",
"either: json,human,txt");
#ifdef BENCHMARK_CONFIG_TUNING
// optionally run an evenly split subset of benchmarks, when making multiple program invocations
parser.set_optional<int>("parallel_instance",
"parallel_instance",
0,
"parallel instance index");
parser.set_optional<int>("parallel_instances",
"parallel_instances",
1,
"total parallel instances");
#endif // BENCHMARK_CONFIG_TUNING
parser.run_and_exit_if_error();

// Parse argv
Expand All @@ -168,25 +88,32 @@ int main(int argc, char *argv[])
add_common_benchmark_info();
benchmark::AddCustomContext("size", std::to_string(size));

using custom_float2 = custom_type<float, float>;
using custom_double2 = custom_type<double, double>;

// Add benchmarks
std::vector<benchmark::internal::Benchmark*> benchmarks =
{
CREATE_BENCHMARK(int, transform<int>),
CREATE_BENCHMARK(long long, transform<long long>),
std::vector<benchmark::internal::Benchmark*> benchmarks = {};
#ifdef BENCHMARK_CONFIG_TUNING
const int parallel_instance = parser.get<int>("parallel_instance");
const int parallel_instances = parser.get<int>("parallel_instances");
config_autotune_register::register_benchmark_subset(benchmarks,
parallel_instance,
parallel_instances,
size,
stream);
#else // BENCHMARK_CONFIG_TUNING
using custom_float2 = custom_type<float, float>;
using custom_double2 = custom_type<double, double>;
CREATE_BENCHMARK(int)
CREATE_BENCHMARK(long long)

CREATE_BENCHMARK(int8_t, transform<int8_t>),
CREATE_BENCHMARK(uint8_t, transform<uint8_t>),
CREATE_BENCHMARK(rocprim::half, transform<rocprim::half>),
CREATE_BENCHMARK(int8_t)
CREATE_BENCHMARK(uint8_t)
CREATE_BENCHMARK(rocprim::half)

CREATE_BENCHMARK(float, transform<float>),
CREATE_BENCHMARK(double, transform<double>),
CREATE_BENCHMARK(float)
CREATE_BENCHMARK(double)

CREATE_BENCHMARK(custom_float2, transform<custom_float2>),
CREATE_BENCHMARK(custom_double2, transform<custom_double2>),
};
CREATE_BENCHMARK(custom_float2)
CREATE_BENCHMARK(custom_double2)
#endif // BENCHMARK_CONFIG_TUNING

// Use manual timing
for(auto& b : benchmarks)
Expand Down
33 changes: 33 additions & 0 deletions benchmark/benchmark_device_transform.parallel.cpp.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include <cstdint>

#include "benchmark_utils.hpp"
#include "benchmark_device_transform.parallel.hpp"

namespace {
auto benchmarks = config_autotune_register::create_bulk(
device_transform_benchmark_generator<
@DataType@,
@BlockSize@>::create);
}
Loading

0 comments on commit de58f35

Please sign in to comment.