-
-
Notifications
You must be signed in to change notification settings - Fork 424
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
6286: Simd helpers r=hkaiser a=Johan511 simd helpers based on intel implementation Co-authored-by: Hari Hara Naveen S <johan511@rostam1.rostam.cct.lsu.edu> Co-authored-by: Hari Hara Naveen S <johan511@medusa11.rostam.cct.lsu.edu>
- Loading branch information
Showing
6 changed files
with
313 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
161 changes: 161 additions & 0 deletions
161
libs/core/algorithms/include/hpx/parallel/unseq/simd_helpers.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
// Copyright (c) 2022 A Kishore Kumar | ||
// Copyright (c) 2023 Hartmut Kaiser | ||
// | ||
// SPDX-License-Identifier: BSL-1.0 | ||
// Distributed under the Boost Software License, Version 1.0. (See accompanying | ||
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | ||
|
||
#pragma once | ||
|
||
#include <hpx/config.hpp> | ||
#include <hpx/functional/detail/invoke.hpp> | ||
#include <hpx/iterator_support/traits/is_iterator.hpp> | ||
#include <hpx/parallel/unseq/reduce_helpers.hpp> | ||
#include <hpx/type_support/construct_at.hpp> | ||
|
||
#include <algorithm> | ||
#include <cstddef> | ||
#include <cstdint> | ||
#include <memory> | ||
#include <type_traits> | ||
#include <utility> | ||
|
||
// The helpers below index their iterators arithmetically (first + i), so they must only be used with random access iterators.
namespace hpx::parallel::util { | ||
/* | ||
Compiler and Hardware should also support vector operations for IterDiff, | ||
else we see slower performance when compared to sequential version | ||
*/ | ||
template <typename Iter, typename IterDiff, typename F> | ||
Iter unseq_first_n(Iter const first, IterDiff const n, F&& f) noexcept | ||
{ | ||
/* | ||
OMP loops can not have ++Iter, only integral types are allowed | ||
Hence perform arthemetic on Iterators | ||
which is O(1) only in case of random access iterators | ||
*/ | ||
static_assert(hpx::traits::is_random_access_iterator_v<Iter>, | ||
"algorithm is efficient only in case of Random Access Iterator"); | ||
#if HPX_EARLYEXIT_PRESENT | ||
IterDiff i = 0; | ||
// clang-format off | ||
HPX_PRAGMA_VECTOR_UNALIGNED HPX_PRAGMA_SIMD_EARLYEXIT | ||
for (; i < n; ++i) | ||
{ | ||
if (f(*(first + i))) | ||
{ | ||
break; | ||
} | ||
} | ||
// clang-format on | ||
|
||
return first + i; | ||
#else | ||
// std::int32_t has best support for vectorization from compilers and hardware | ||
IterDiff i = 0; | ||
static constexpr std::int32_t num_blocks = | ||
HPX_LANE_SIZE / sizeof(std::int32_t); | ||
alignas(HPX_LANE_SIZE) std::int32_t simd_lane[num_blocks] = {0}; | ||
while (i <= n - num_blocks) | ||
{ | ||
std::int32_t found_flag = 0; | ||
|
||
// clang-format off | ||
HPX_PRAGMA_VECTOR_UNALIGNED HPX_VECTOR_REDUCTION(| : found_flag) | ||
for (IterDiff j = i; j < i + num_blocks; ++j) | ||
{ | ||
std::int32_t const t = f(*(first + j)); | ||
simd_lane[j - i] = t; | ||
found_flag |= t; | ||
} | ||
// clang-format on | ||
|
||
if (found_flag) | ||
{ | ||
IterDiff j; | ||
for (j = 0; j < num_blocks; ++j) | ||
{ | ||
if (simd_lane[j]) | ||
{ | ||
break; | ||
} | ||
} | ||
return first + i + j; | ||
} | ||
i += num_blocks; | ||
} | ||
|
||
//Keep remainder scalar | ||
while (i != n) | ||
{ | ||
if (f(*(first + i))) | ||
{ | ||
break; | ||
} | ||
++i; | ||
} | ||
return first + i; | ||
#endif //HPX_EARLYEXIT_PRESENT | ||
} | ||
|
||
template <typename Iter1, typename Iter2, typename IterDiff, typename F> | ||
std::pair<Iter1, Iter2> unseq2_first_n(Iter1 const first1, | ||
Iter2 const first2, IterDiff const n, F&& f) noexcept | ||
{ | ||
#if HPX_EARLYEXIT_PRESENT | ||
IterDiff i = 0; | ||
|
||
// clang-format off | ||
HPX_PRAGMA_VECTOR_UNALIGNED HPX_PRAGMA_SIMD_EARLYEXIT | ||
for (; i < n; ++i) | ||
if (f(*(first1 + i), *(first2 + i))) | ||
break; | ||
// clang-format on | ||
|
||
return std::make_pair(first1 + i, first2 + i); | ||
#else | ||
|
||
static constexpr std::int32_t num_blocks = | ||
HPX_LANE_SIZE / sizeof(std::int32_t); | ||
alignas(HPX_LANE_SIZE) std::int32_t simd_lane[num_blocks] = {0}; | ||
|
||
IterDiff outer_loop_ind = 0; | ||
while (outer_loop_ind <= n - num_blocks) | ||
{ | ||
std::int32_t found_flag = 0; | ||
IterDiff i; | ||
|
||
// clang-format off | ||
HPX_PRAGMA_VECTOR_UNALIGNED HPX_VECTOR_REDUCTION(| : found_flag) | ||
for (i = 0; i < num_blocks; ++i) | ||
{ | ||
IterDiff const t = f(*(first1 + outer_loop_ind + i), | ||
*(first2 + outer_loop_ind + i)); | ||
simd_lane[i] = t; | ||
found_flag |= t; | ||
} | ||
// clang-format on | ||
|
||
if (found_flag) | ||
{ | ||
IterDiff i2; | ||
for (i2 = 0; i2 < num_blocks; ++i2) | ||
{ | ||
if (simd_lane[i2]) | ||
break; | ||
} | ||
return std::make_pair( | ||
first1 + outer_loop_ind + i2, first2 + outer_loop_ind + i2); | ||
} | ||
outer_loop_ind += num_blocks; | ||
} | ||
|
||
//Keep remainder scalar | ||
for (; outer_loop_ind != n; ++outer_loop_ind) | ||
if (f(*(first1 + outer_loop_ind), *(first2 + outer_loop_ind))) | ||
break; | ||
|
||
return std::make_pair(first1 + outer_loop_ind, first2 + outer_loop_ind); | ||
#endif //HPX_EARLYEXIT_PRESENT | ||
} | ||
} // namespace hpx::parallel::util |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
105 changes: 105 additions & 0 deletions
105
libs/core/algorithms/tests/unit/algorithms/util/test_simd_helpers.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
// Copyright (c) 2023 Johan511 | ||
|
||
// SPDX-License-Identifier: BSL-1.0 | ||
// Distributed under the Boost Software License, Version 1.0. (See accompanying | ||
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | ||
|
||
#include <hpx/modules/testing.hpp> | ||
#include <hpx/parallel/unseq/simd_helpers.hpp> | ||
|
||
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <random>
#include <utility>
#include <vector>
|
||
// Seed of the pseudo random number generator that produces the test data.
// It is kept in the unsigned type returned by std::random_device: squeezing
// the value into an int could turn it negative, which is confusing once the
// seed is printed or re-entered to reproduce a run.
unsigned int seed = std::random_device{}();

// Generator shared by all tests in this file.
std::mt19937 gen(seed);
|
||
using hpx::parallel::util::unseq_first_n, hpx::parallel::util::unseq2_first_n; | ||
|
||
template <typename T> | ||
void test_unseq_first_n1_dispatch2(std::size_t length, std::size_t first_index) | ||
{ | ||
first_index = first_index % length; | ||
|
||
std::vector<T> v(length, static_cast<T>(false)); | ||
std::size_t i = 0; | ||
|
||
std::for_each(v.begin(), v.end(), [&](T& t) { | ||
if (i == first_index) | ||
t = 1; | ||
else if (i > first_index) | ||
t = gen() % 2; | ||
else | ||
t = 0; | ||
i++; | ||
}); | ||
|
||
auto f = [](T t) { return t; }; | ||
|
||
auto iter_test = hpx::parallel::util::unseq_first_n( | ||
v.begin(), static_cast<T>(length), f); | ||
|
||
auto iter_known = v.begin() + first_index; | ||
|
||
HPX_TEST(iter_test == iter_known); | ||
} | ||
|
||
void test_unseq_first_n1_dispatch1() | ||
{ | ||
test_unseq_first_n1_dispatch2<int>(gen() % 10007, gen()); | ||
} | ||
|
||
template <typename T> | ||
void test_unseq_first_n2_dispatch2(std::size_t length, std::size_t first_index) | ||
{ | ||
first_index = first_index % length; | ||
std::vector<T> v1(length, static_cast<T>(false)); | ||
std::vector<T> v2(length, static_cast<T>(false)); | ||
|
||
std::size_t idx = 0; | ||
|
||
while (idx != length) | ||
{ | ||
if (idx == first_index) | ||
{ | ||
v1[idx] = 1; | ||
v2[idx] = 1; | ||
} | ||
else if (idx > first_index) | ||
{ | ||
v1[idx] = gen() % 2; | ||
v2[idx] = gen() % 2; | ||
} | ||
else | ||
{ | ||
v1[idx] = 0; | ||
v2[idx] = 0; | ||
} | ||
idx++; | ||
} | ||
|
||
auto f = [](T t1, T t2) { return t1 && t2; }; | ||
|
||
auto iter_pair_test = hpx::parallel::util::unseq2_first_n( | ||
v1.begin(), v2.begin(), static_cast<T>(length), f); | ||
|
||
auto iter_pair_value = | ||
std::make_pair(v1.begin() + first_index, v2.begin() + first_index); | ||
|
||
HPX_TEST(iter_pair_test == iter_pair_value); | ||
} | ||
|
||
void test_unseq_first_n2_dispatch1() | ||
{ | ||
test_unseq_first_n2_dispatch2<int>(gen() % 10007, gen()); | ||
} | ||
|
||
int main(int, char*[]) | ||
{ | ||
test_unseq_first_n1_dispatch1(); // Predicate takes single argument | ||
test_unseq_first_n2_dispatch1(); // Predicate takes two arguments | ||
|
||
return hpx::util::report_errors(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters