-
Notifications
You must be signed in to change notification settings - Fork 290
Pool3d fwd #697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Pool3d fwd #697
Changes from all commits
Commits
Show all changes
48 commits
Select commit
Hold shift + click to select a range
8332414
Expand the base class of pool2d, prepare to share base class with pool3d
rocking5566 912f06d
Add pool3d device op
rocking5566 b60f7d8
Add pool3d f16 example
rocking5566 646f689
Merge branch 'develop' into pool3d_fwd
rocking5566 b75abe7
Refactor the base class. implement generic pooling in the future
rocking5566 ed55271
clang format
rocking5566 9b22bdd
get original index in max pooling
rocking5566 16595e9
Add outputindex to base class
rocking5566 a014aa0
Fix dimension
rocking5566 d575411
Add pooling instance
rocking5566 2422171
Use indexType instead
rocking5566 9255407
Remove useless header
rocking5566 58e912d
Merge branch 'develop' into pool3d_fwd
rocking5566 7b83391
Extract IndexDataType to template
rocking5566 7905cf7
Extract pooling reference code
rocking5566 03e4f7e
clang format
rocking5566 ba9c511
clang format
rocking5566 9621381
Fix typo
rocking5566 4eb307e
Add tensor stride
rocking5566 102b492
Add missing header
rocking5566 3c8d984
Add index stride and output stride
rocking5566 21ee92f
Merge branch 'develop' into pool3d_fwd
rocking5566 2280e5b
Refine naming
rocking5566 158ec14
Add type to base class
rocking5566 e74b1dc
Rename file
rocking5566 6f2ef10
Use proper size
rocking5566 91669fc
Fix typo
rocking5566 382c2ca
Refine naming
rocking5566 586d61e
Modify the argument into vector.
rocking5566 b31307e
Add max pool profiler
rocking5566 b798c73
Refine naming
rocking5566 376a804
Support f32 pool
rocking5566 df43a05
Fix typo
rocking5566 44318b0
Add avg pool2d fwd in profiler
rocking5566 c89fb58
clang format
rocking5566 e1914e7
Rename AccDatatype to ComputeDatatype
rocking5566 5c92615
Fix init
rocking5566 a465273
test pool
rocking5566 814fe1b
Extract variable
rocking5566 fe8ed3d
Add client example
rocking5566 d22124d
Check the pooling dim
rocking5566 254e48a
clang format
rocking5566 6fd629d
Connect argv and arg_parser
rocking5566 75725f7
Add found check
rocking5566 084588c
Remove useless header
rocking5566 7c42995
Refine naming
rocking5566 3fa7f1e
Merge branch 'develop' into pool3d_fwd
rocking5566 91e3748
Adjust the order of device_pool_fwd
rocking5566 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
# Max pool 2D forward client example.
add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)

# Average pool 3D forward client example.
add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,199 @@ | ||
| // SPDX-License-Identifier: MIT | ||
| // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. | ||
|
|
||
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
|
|
||
// Element types of the input and output tensors (half precision).
using InDataType  = ck::half_t;
using OutDataType = ck::half_t;
// Type of the returned argmax indices; only used when OutputIndex is true.
using IndexDataType = int32_t;

// 5-D tensors (N, C, D, H, W) pooled over a 3-D window (Z, Y, X).
constexpr ck::index_t InOutRank  = 5;
constexpr ck::index_t WindowRank = 3;
// Flip this preprocessor switch to exercise max pooling instead of average.
#if 0
constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
constexpr bool OutputIndex = false;
#else
constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
constexpr bool OutputIndex = false;
#endif
|
|
||
| struct SimpleDeviceMem | ||
| { | ||
| SimpleDeviceMem() = delete; | ||
|
|
||
| SimpleDeviceMem(std::size_t mem_size) : p_mem_{} | ||
| { | ||
| (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size); | ||
| } | ||
|
|
||
| void* GetDeviceBuffer() { return p_mem_; } | ||
|
|
||
| ~SimpleDeviceMem() { (void)hipFree(p_mem_); } | ||
|
|
||
| void* p_mem_; | ||
| }; | ||
|
|
||
| int main(int argc, char* argv[]) | ||
| { | ||
| ck::index_t N = 2; | ||
| ck::index_t C = 32; | ||
| ck::index_t Z = 2; | ||
| ck::index_t Y = 2; | ||
| ck::index_t X = 2; | ||
| ck::index_t Di = 30; | ||
| ck::index_t Hi = 30; | ||
| ck::index_t Wi = 30; | ||
| ck::index_t window_stride_d = 2; | ||
| ck::index_t window_stride_h = 2; | ||
| ck::index_t window_stride_w = 2; | ||
| ck::index_t in_left_pad_d = 1; | ||
| ck::index_t in_left_pad_h = 1; | ||
| ck::index_t in_left_pad_w = 1; | ||
| ck::index_t in_right_pad_d = 1; | ||
| ck::index_t in_right_pad_h = 1; | ||
| ck::index_t in_right_pad_w = 1; | ||
|
|
||
| ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1; | ||
| ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1; | ||
| ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1; | ||
|
|
||
| // Pool API only support the order of NCDHW | ||
| std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi}; | ||
| std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo}; | ||
| std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X}; | ||
| std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w}; | ||
| std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w}; | ||
| std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w}; | ||
|
|
||
| std::size_t in_tensor_size = N * C * Di * Hi * Wi; | ||
| std::size_t out_tensor_size = N * C * Do * Ho * Wo; | ||
|
|
||
| // tensor layout = NDHWC | ||
| std::vector<ck::index_t> in_tensor_stride = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C}; | ||
| std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C}; | ||
|
|
||
| SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size); | ||
| SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size); | ||
| SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size); | ||
|
|
||
| using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank, | ||
| WindowRank, | ||
| InDataType, | ||
| OutDataType, | ||
| IndexDataType, | ||
| ReduceOpId, | ||
| OutputIndex>; | ||
|
|
||
| // get device op instances | ||
| const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< | ||
| DeviceOp>::GetInstances(); | ||
|
|
||
| std::cout << "found " << op_ptrs.size() << " instances" << std::endl; | ||
|
|
||
| std::string best_op_name; | ||
| bool found = false; | ||
| int best_op_id = -1; | ||
| float best_ave_time = std::numeric_limits<float>::max(); | ||
| float best_gb_per_sec = 0; | ||
|
|
||
| // profile device operation instances | ||
| std::cout << "Run all instances and do timing" << std::endl; | ||
|
|
||
| for(int i = 0; i < op_ptrs.size(); ++i) | ||
| { | ||
| auto& op_ptr = op_ptrs[i]; | ||
| auto argument_ptr = op_ptr->MakeArgumentPointer( | ||
| static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), | ||
| static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), | ||
| static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()), | ||
| in_length, | ||
| window_spatial_lengths, | ||
| out_length, | ||
| in_tensor_stride, | ||
| out_tensor_stride, | ||
| out_tensor_stride, | ||
| window_strides, | ||
| input_left_pads, | ||
| input_right_pads, | ||
| {2, 3, 4}); | ||
|
|
||
| auto invoker_ptr = op_ptr->MakeInvokerPointer(); | ||
|
|
||
| std::string op_name = op_ptr->GetTypeString(); | ||
|
|
||
| if(op_ptr->IsSupportedArgument(argument_ptr.get())) | ||
| { | ||
| float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); | ||
|
|
||
| std::size_t num_bytes = | ||
| in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType); | ||
|
|
||
| if constexpr(OutputIndex) | ||
| num_bytes += out_tensor_size * sizeof(IndexDataType); | ||
|
|
||
| float gb_per_sec = num_bytes / 1.E6 / ave_time; | ||
|
|
||
| std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " | ||
| << op_name << std::endl; | ||
|
|
||
| if(ave_time < best_ave_time) | ||
| { | ||
| found = true; | ||
| best_op_id = i; | ||
| best_op_name = op_name; | ||
| best_ave_time = ave_time; | ||
| best_gb_per_sec = gb_per_sec; | ||
| } | ||
| } | ||
| else | ||
| { | ||
| std::cout << op_name << " does not support this problem" << std::endl; | ||
| } | ||
| } | ||
|
|
||
| // run the best intance | ||
| if(found) | ||
| { | ||
| std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " | ||
| << best_op_name << std::endl; | ||
|
|
||
| auto& op_ptr = op_ptrs[best_op_id]; | ||
| std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() | ||
| << std::endl; | ||
|
|
||
| auto argument_ptr = op_ptr->MakeArgumentPointer( | ||
| static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), | ||
| static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), | ||
| static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()), | ||
| in_length, | ||
| window_spatial_lengths, | ||
| out_length, | ||
| in_tensor_stride, | ||
| out_tensor_stride, | ||
| out_tensor_stride, | ||
| window_strides, | ||
| input_left_pads, | ||
| input_right_pads, | ||
| {2, 3, 4}); | ||
|
|
||
| auto invoker_ptr = op_ptr->MakeInvokerPointer(); | ||
|
|
||
| if(op_ptr->IsSupportedArgument(argument_ptr.get())) | ||
| { | ||
| invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); | ||
| } | ||
|
|
||
| std::cout << "Done" << std::endl; | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.