Skip to content

Commit

Permalink
Merge branch 'benchmark-seed' into 'develop_stream'
Browse files Browse the repository at this point in the history
Specify benchmark seed via command line

Closes #495

See merge request amd/libraries/rocPRIM!657
  • Loading branch information
nolmoonen authored and Naraenda committed Jul 18, 2024
2 parents 9e849db + 0717163 commit 9148c77
Show file tree
Hide file tree
Showing 53 changed files with 1,022 additions and 585 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ autotune:execute-tuning:
--benchmark_filter_regex="${AUTOTUNE_ALGORITHM_REGEX}"
--size="${AUTOTUNE_SIZE}"
--trials="${AUTOTUNE_TRIALS}"
--seed=82589933

autotune:generate-config:
image: python:3.10.5-buster
Expand Down
13 changes: 10 additions & 3 deletions .gitlab/run_benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -28,7 +28,7 @@
import subprocess
import sys

BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials'])
BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed'])

def run_benchmarks(benchmark_context):
def is_benchmark_executable(filename):
Expand Down Expand Up @@ -61,6 +61,8 @@ def is_benchmark_executable(filename):
args += ['--size', benchmark_context.size]
if benchmark_context.trials:
args += ['--trials', benchmark_context.trials]
if benchmark_context.seed:
args += ['--seed', benchmark_context.seed]
try:
subprocess.check_call(args)
except subprocess.CalledProcessError as error:
Expand Down Expand Up @@ -97,6 +99,10 @@ def main():
help='Controls the number of trial iterations for each benchmark case',
default='',
required=False)
parser.add_argument('--seed',
help='Controls the seed for random number generation for each benchmark case',
default='',
required=False)

args = parser.parse_args()

Expand All @@ -107,7 +113,8 @@ def main():
args.benchmark_filename_regex,
args.benchmark_filter_regex,
args.size,
args.trials)
args.trials,
args.seed)

benchmark_run_successful = run_benchmarks(benchmark_context)

Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@ Documentation for rocPRIM is available at

### Changes

### Added

* A `--seed` option for the benchmarks, which specifies the seed used to generate random inputs. By default, a random seed is still used for each benchmark measurement.

### Changes

* Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different.
* Changed the default seed for `device_benchmark_segmented_reduce`.

### Fixes

Expand Down
63 changes: 36 additions & 27 deletions benchmark/benchmark_block_adjacent_difference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,13 @@ struct subtract_right_partial
}
};

template <class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials = 100>
auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
template<class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials = 100>
auto run_benchmark(benchmark::State& state, size_t N, const managed_seed& seed, hipStream_t stream)
-> std::enable_if_t<!std::is_same<Benchmark, subtract_left_partial>::value
&& !std::is_same<Benchmark, subtract_right_partial>::value>
{
Expand All @@ -237,7 +237,7 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
// Round up size to the next multiple of items_per_block
const auto size = num_blocks * items_per_block;

const std::vector<T> input = get_random_data<T>(size, T(0), T(10));
const std::vector<T> input = get_random_data<T>(size, T(0), T(10), seed.get_0());
T* d_input;
T* d_output;
HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
Expand Down Expand Up @@ -287,13 +287,13 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
HIP_CHECK(hipFree(d_output));
}

template <class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials = 100>
auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
template<class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials = 100>
auto run_benchmark(benchmark::State& state, size_t N, const managed_seed& seed, hipStream_t stream)
-> std::enable_if_t<std::is_same<Benchmark, subtract_left_partial>::value
|| std::is_same<Benchmark, subtract_right_partial>::value>
{
Expand All @@ -302,9 +302,9 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
// Round up size to the next multiple of items_per_block
const auto size = num_blocks * items_per_block;

const std::vector<T> input = get_random_data<T>(size, T(0), T(10));
const std::vector<T> input = get_random_data<T>(size, T(0), T(10), seed.get_0());
const std::vector<unsigned int> tile_sizes
= get_random_data<unsigned int>(num_blocks, 0, items_per_block);
= get_random_data<unsigned int>(num_blocks, 0, items_per_block, seed.get_1());

T* d_input;
unsigned int* d_tile_sizes;
Expand Down Expand Up @@ -372,8 +372,9 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
",with_tile:" #WITH_TILE "}}") \
.c_str(), \
run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \
stream, \
size)
size, \
seed, \
stream)

#define BENCHMARK_TYPE(type, block, with_tile) \
CREATE_BENCHMARK(type, block, 1, with_tile), \
Expand All @@ -383,12 +384,12 @@ auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
CREATE_BENCHMARK(type, block, 16, with_tile), \
CREATE_BENCHMARK(type, block, 32, with_tile)


template<class Benchmark>
void add_benchmarks(const std::string& name,
void add_benchmarks(const std::string& name,
std::vector<benchmark::internal::Benchmark*>& benchmarks,
hipStream_t stream,
size_t size)
size_t size,
const managed_seed& seed,
hipStream_t stream)
{
std::vector<benchmark::internal::Benchmark*> bs =
{
Expand Down Expand Up @@ -423,27 +424,35 @@ int main(int argc, char *argv[])
"name_format",
"human",
"either: json,human,txt");
parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
parser.run_and_exit_if_error();

// Parse argv
benchmark::Initialize(&argc, argv);
const size_t size = parser.get<size_t>("size");
const int trials = parser.get<int>("trials");
bench_naming::set_format(parser.get<std::string>("name_format"));
const std::string seed_type = parser.get<std::string>("seed");
const managed_seed seed(seed_type);

// HIP
hipStream_t stream = 0; // default

// Benchmark info
add_common_benchmark_info();
benchmark::AddCustomContext("size", std::to_string(size));
benchmark::AddCustomContext("seed", seed_type);

// Add benchmarks
std::vector<benchmark::internal::Benchmark*> benchmarks;
add_benchmarks<subtract_left>("subtract_left", benchmarks, stream, size);
add_benchmarks<subtract_right>("subtract_right", benchmarks, stream, size);
add_benchmarks<subtract_left_partial>("subtract_left_partial", benchmarks, stream, size);
add_benchmarks<subtract_right_partial>("subtract_right_partial", benchmarks, stream, size);
add_benchmarks<subtract_left>("subtract_left", benchmarks, size, seed, stream);
add_benchmarks<subtract_right>("subtract_right", benchmarks, size, seed, stream);
add_benchmarks<subtract_left_partial>("subtract_left_partial", benchmarks, size, seed, stream);
add_benchmarks<subtract_right_partial>("subtract_right_partial",
benchmarks,
size,
seed,
stream);

// Use manual timing
for(auto& b : benchmarks)
Expand Down
41 changes: 22 additions & 19 deletions benchmark/benchmark_block_discontinuity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,20 +195,18 @@ struct flag_heads_and_tails
}
};

template<
class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials = 100
>
void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
template<class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
bool WithTile,
unsigned int Trials = 100>
void run_benchmark(benchmark::State& state, size_t N, const managed_seed& seed, hipStream_t stream)
{
constexpr auto items_per_block = BlockSize * ItemsPerThread;
const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);

std::vector<T> input = get_random_data<T>(size, T(0), T(10));
std::vector<T> input = get_random_data<T>(size, T(0), T(10), seed.get_0());
T * d_input;
T * d_output;
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
Expand Down Expand Up @@ -266,8 +264,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
",with_tile:" #WITH_TILE "}}") \
.c_str(), \
run_benchmark<Benchmark, T, BS, IPT, WITH_TILE>, \
stream, \
size)
size, \
seed, \
stream)

#define BENCHMARK_TYPE(type, block, bool) \
CREATE_BENCHMARK(type, block, 1, bool), \
Expand All @@ -276,12 +275,12 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
CREATE_BENCHMARK(type, block, 4, bool), \
CREATE_BENCHMARK(type, block, 8, bool)


template<class Benchmark>
void add_benchmarks(const std::string& name,
void add_benchmarks(const std::string& name,
std::vector<benchmark::internal::Benchmark*>& benchmarks,
hipStream_t stream,
size_t size)
size_t size,
const managed_seed& seed,
hipStream_t stream)
{
std::vector<benchmark::internal::Benchmark*> bs =
{
Expand Down Expand Up @@ -309,26 +308,30 @@ int main(int argc, char *argv[])
"name_format",
"human",
"either: json,human,txt");
parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
parser.run_and_exit_if_error();

// Parse argv
benchmark::Initialize(&argc, argv);
const size_t size = parser.get<size_t>("size");
const int trials = parser.get<int>("trials");
bench_naming::set_format(parser.get<std::string>("name_format"));
const std::string seed_type = parser.get<std::string>("seed");
const managed_seed seed(seed_type);

// HIP
hipStream_t stream = 0; // default

// Benchmark info
add_common_benchmark_info();
benchmark::AddCustomContext("size", std::to_string(size));
benchmark::AddCustomContext("seed", seed_type);

// Add benchmarks
std::vector<benchmark::internal::Benchmark*> benchmarks;
add_benchmarks<flag_heads>("flag_heads", benchmarks, stream, size);
add_benchmarks<flag_tails>("flag_tails", benchmarks, stream, size);
add_benchmarks<flag_heads_and_tails>("flag_heads_and_tails", benchmarks, stream, size);
add_benchmarks<flag_heads>("flag_heads", benchmarks, size, seed, stream);
add_benchmarks<flag_tails>("flag_tails", benchmarks, size, seed, stream);
add_benchmarks<flag_heads_and_tails>("flag_heads_and_tails", benchmarks, size, seed, stream);

// Use manual timing
for(auto& b : benchmarks)
Expand Down
52 changes: 32 additions & 20 deletions benchmark/benchmark_block_exchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,14 +241,12 @@ struct scatter_to_striped
}
};

template<
class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Trials = 100
>
void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
template<class Benchmark,
class T,
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Trials = 100>
void run_benchmark(benchmark::State& state, size_t N, const managed_seed& seed, hipStream_t stream)
{
constexpr auto items_per_block = BlockSize * ItemsPerThread;
const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block);
Expand All @@ -261,7 +259,7 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
}
std::vector<unsigned int> ranks(size);
// Fill ranks (for scatter operations)
std::mt19937 gen;
engine_type gen(seed.get_0());
for(size_t bi = 0; bi < size / items_per_block; bi++)
{
auto block_ranks = ranks.begin() + bi * items_per_block;
Expand Down Expand Up @@ -334,8 +332,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
+ ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \
.c_str(), \
run_benchmark<Benchmark, T, BS, IPT>, \
stream, \
size)
size, \
seed, \
stream)

#define BENCHMARK_TYPE(type, block) \
CREATE_BENCHMARK(type, block, 1), \
Expand All @@ -346,10 +345,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
CREATE_BENCHMARK(type, block, 8)

template<class Benchmark>
void add_benchmarks(const std::string& name,
void add_benchmarks(const std::string& name,
std::vector<benchmark::internal::Benchmark*>& benchmarks,
hipStream_t stream,
size_t size)
size_t size,
const managed_seed& seed,
hipStream_t stream)
{
using custom_float2 = custom_type<float, float>;
using custom_double2 = custom_type<double, double>;
Expand Down Expand Up @@ -379,29 +379,41 @@ int main(int argc, char *argv[])
"name_format",
"human",
"either: json,human,txt");
parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
parser.run_and_exit_if_error();

// Parse argv
benchmark::Initialize(&argc, argv);
const size_t size = parser.get<size_t>("size");
const int trials = parser.get<int>("trials");
bench_naming::set_format(parser.get<std::string>("name_format"));
const std::string seed_type = parser.get<std::string>("seed");
const managed_seed seed(seed_type);

// HIP
hipStream_t stream = 0; // default

// Benchmark info
add_common_benchmark_info();
benchmark::AddCustomContext("size", std::to_string(size));
benchmark::AddCustomContext("seed", seed_type);

// Add benchmarks
std::vector<benchmark::internal::Benchmark*> benchmarks;
add_benchmarks<blocked_to_striped>("blocked_to_striped", benchmarks, stream, size);
add_benchmarks<striped_to_blocked>("striped_to_blocked", benchmarks, stream, size);
add_benchmarks<blocked_to_warp_striped>("blocked_to_warp_striped", benchmarks, stream, size);
add_benchmarks<warp_striped_to_blocked>("warp_striped_to_blocked", benchmarks, stream, size);
add_benchmarks<scatter_to_blocked>("scatter_to_blocked", benchmarks, stream, size);
add_benchmarks<scatter_to_striped>("scatter_to_striped", benchmarks, stream, size);
add_benchmarks<blocked_to_striped>("blocked_to_striped", benchmarks, size, seed, stream);
add_benchmarks<striped_to_blocked>("striped_to_blocked", benchmarks, size, seed, stream);
add_benchmarks<blocked_to_warp_striped>("blocked_to_warp_striped",
benchmarks,
size,
seed,
stream);
add_benchmarks<warp_striped_to_blocked>("warp_striped_to_blocked",
benchmarks,
size,
seed,
stream);
add_benchmarks<scatter_to_blocked>("scatter_to_blocked", benchmarks, size, seed, stream);
add_benchmarks<scatter_to_striped>("scatter_to_striped", benchmarks, size, seed, stream);

// Use manual timing
for(auto& b : benchmarks)
Expand Down
Loading

0 comments on commit 9148c77

Please sign in to comment.