Skip to content

Commit

Permalink
Merge #6286
Browse files Browse the repository at this point in the history
6286: Simd helpers r=hkaiser a=Johan511

simd helpers based on intel implementation


Co-authored-by: Hari Hara Naveen S <johan511@rostam1.rostam.cct.lsu.edu>
Co-authored-by: Hari Hara Naveen S <johan511@medusa11.rostam.cct.lsu.edu>
  • Loading branch information
3 people committed Jul 23, 2023
2 parents 96c3f4a + 8d9a628 commit 09eabc7
Show file tree
Hide file tree
Showing 6 changed files with 313 additions and 6 deletions.
1 change: 1 addition & 0 deletions libs/core/algorithms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ set(algorithms_headers
hpx/parallel/unseq/loop.hpp
hpx/parallel/unseq/reduce.hpp
hpx/parallel/unseq/reduce_helpers.hpp
hpx/parallel/unseq/simd_helpers.hpp
hpx/parallel/unseq/transform_loop.hpp
hpx/parallel/util/adapt_placement_mode.hpp
hpx/parallel/util/adapt_sharing_mode.hpp
Expand Down
1 change: 1 addition & 0 deletions libs/core/algorithms/include/hpx/parallel/unseq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@

#include <hpx/parallel/unseq/loop.hpp>
#include <hpx/parallel/unseq/reduce.hpp>
#include <hpx/parallel/unseq/simd_helpers.hpp>
#include <hpx/parallel/unseq/transform_loop.hpp>
161 changes: 161 additions & 0 deletions libs/core/algorithms/include/hpx/parallel/unseq/simd_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
// Copyright (c) 2022 A Kishore Kumar
// Copyright (c) 2023 Hartmut Kaiser
//
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#pragma once

#include <hpx/config.hpp>
#include <hpx/functional/detail/invoke.hpp>
#include <hpx/iterator_support/traits/is_iterator.hpp>
#include <hpx/parallel/unseq/reduce_helpers.hpp>
#include <hpx/type_support/construct_at.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <type_traits>
#include <utility>

// The helpers below address elements as `first + i`, so every iterator type passed to them must be a random access iterator.
namespace hpx::parallel::util {
// Finds the first element of [first, first + n) that satisfies a predicate.
//
// Template parameters:
//   Iter      random access iterator (enforced by the static_assert below)
//   IterDiff  integral type used to count and index elements. Compiler and
//             hardware should support vector operations on this type,
//             otherwise this can be slower than the sequential version.
//   F         unary predicate; its result is converted to bool
//
// Parameters:
//   first  start of the range
//   n      number of elements to inspect; n <= 0 returns first
//   f      predicate, invoked as f(*(first + i))
//
// Returns an iterator to the first element for which f is true, or first + n
// if no element matches.
//
// Notes:
//   * The function is noexcept, so an exception escaping f terminates the
//     program.
//   * In the blocked fallback path f is evaluated for every element of a
//     block, i.e. possibly also for elements located after the first match.
template <typename Iter, typename IterDiff, typename F>
Iter unseq_first_n(Iter const first, IterDiff const n, F&& f) noexcept
{
    // OpenMP simd loops only accept integral induction variables (no ++Iter),
    // hence elements are addressed through iterator arithmetic, which is
    // O(1) only for random access iterators.
    static_assert(hpx::traits::is_random_access_iterator_v<Iter>,
        "algorithm is efficient only in case of Random Access Iterator");

    // HPX_EARLYEXIT_PRESENT is defined without a value, so it has to be
    // tested with defined(); a plain `#if MACRO` would not preprocess.
#if defined(HPX_EARLYEXIT_PRESENT)
    IterDiff i = 0;
    // clang-format off
    HPX_PRAGMA_VECTOR_UNALIGNED HPX_PRAGMA_SIMD_EARLYEXIT
    for (; i < n; ++i)
    {
        if (f(*(first + i)))
        {
            break;
        }
    }
    // clang-format on

    return first + i;
#else
    // std::int32_t has best support for vectorization from compilers and
    // hardware, so predicate results are stored as 0/1 in int32 lanes.
    // HPX_LANE_SIZE is not defined in this file (presumably it comes from
    // reduce_helpers.hpp).
    static constexpr std::int32_t num_blocks =
        HPX_LANE_SIZE / sizeof(std::int32_t);
    static constexpr IterDiff block_size = static_cast<IterDiff>(num_blocks);
    alignas(HPX_LANE_SIZE) std::int32_t simd_lane[num_blocks] = {0};

    IterDiff i = 0;

    // `n - i` cannot wrap: i starts at 0 and only grows while a full block
    // still fits. (The former `i <= n - num_blocks` wrapped around for
    // unsigned IterDiff whenever n < num_blocks.)
    while (n - i >= block_size)
    {
        std::int32_t found_flag = 0;

        // clang-format off
        HPX_PRAGMA_VECTOR_UNALIGNED HPX_VECTOR_REDUCTION(| : found_flag)
        for (IterDiff j = 0; j < block_size; ++j)
        {
            // Normalize to 0/1 so that truthy results which would narrow to
            // an int32 zero (e.g. 0.5) are not lost.
            std::int32_t const t = static_cast<bool>(f(*(first + (i + j))));
            simd_lane[j] = t;
            found_flag |= t;
        }
        // clang-format on

        if (found_flag)
        {
            // At least one lane matched: locate the first one sequentially.
            IterDiff j = 0;
            for (; j < block_size; ++j)
            {
                if (simd_lane[j])
                {
                    break;
                }
            }
            return first + (i + j);
        }
        i += block_size;
    }

    // Keep remainder scalar; `<` (instead of `!=`) terminates for n < 0 as
    // well, matching the early-exit branch above.
    for (; i < n; ++i)
    {
        if (f(*(first + i)))
        {
            break;
        }
    }
    return first + i;
#endif    // HPX_EARLYEXIT_PRESENT
}

template <typename Iter1, typename Iter2, typename IterDiff, typename F>
std::pair<Iter1, Iter2> unseq2_first_n(Iter1 const first1,
Iter2 const first2, IterDiff const n, F&& f) noexcept
{
#if HPX_EARLYEXIT_PRESENT
IterDiff i = 0;

// clang-format off
HPX_PRAGMA_VECTOR_UNALIGNED HPX_PRAGMA_SIMD_EARLYEXIT
for (; i < n; ++i)
if (f(*(first1 + i), *(first2 + i)))
break;
// clang-format on

return std::make_pair(first1 + i, first2 + i);
#else

static constexpr std::int32_t num_blocks =
HPX_LANE_SIZE / sizeof(std::int32_t);
alignas(HPX_LANE_SIZE) std::int32_t simd_lane[num_blocks] = {0};

IterDiff outer_loop_ind = 0;
while (outer_loop_ind <= n - num_blocks)
{
std::int32_t found_flag = 0;
IterDiff i;

// clang-format off
HPX_PRAGMA_VECTOR_UNALIGNED HPX_VECTOR_REDUCTION(| : found_flag)
for (i = 0; i < num_blocks; ++i)
{
IterDiff const t = f(*(first1 + outer_loop_ind + i),
*(first2 + outer_loop_ind + i));
simd_lane[i] = t;
found_flag |= t;
}
// clang-format on

if (found_flag)
{
IterDiff i2;
for (i2 = 0; i2 < num_blocks; ++i2)
{
if (simd_lane[i2])
break;
}
return std::make_pair(
first1 + outer_loop_ind + i2, first2 + outer_loop_ind + i2);
}
outer_loop_ind += num_blocks;
}

//Keep remainder scalar
for (; outer_loop_ind != n; ++outer_loop_ind)
if (f(*(first1 + outer_loop_ind), *(first2 + outer_loop_ind)))
break;

return std::make_pair(first1 + outer_loop_ind, first2 + outer_loop_ind);
#endif //HPX_EARLYEXIT_PRESENT
}
} // namespace hpx::parallel::util
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

set(tests test_low_level test_merge_four test_merge_vector test_nbits
test_range
test_range test_simd_helpers
)

foreach(test ${tests})
Expand Down
105 changes: 105 additions & 0 deletions libs/core/algorithms/tests/unit/algorithms/util/test_simd_helpers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright (c) 2023 Johan511

// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/modules/testing.hpp>
#include <hpx/parallel/unseq/simd_helpers.hpp>

#include <algorithm>
#include <cstddef>
#include <random>
#include <utility>
#include <vector>

// Seed for this test run, obtained from std::random_device.
// NOTE(review): the seed is not printed anywhere in this file, so a failing
// run cannot be replayed from its output -- consider logging it.
int seed = std::random_device{}();
// Global Mersenne Twister engine used by the test helpers below to pick the
// range length, the position of the first match and the trailing values.
std::mt19937 gen(seed);

// Make the helpers under test available for unqualified use.
using hpx::parallel::util::unseq_first_n, hpx::parallel::util::unseq2_first_n;

// Checks unseq_first_n against a vector with a known first match.
//
// Builds `length` elements of type T: zeros before position
// `first_index % length`, a one at that position and random 0/1 values (drawn
// from the global `gen`) after it. unseq_first_n with an identity predicate
// must then return the iterator to that position.
//
// A `length` of zero is checked as an empty range, for which the helper is
// expected to return the (begin == end) iterator.
template <typename T>
void test_unseq_first_n1_dispatch2(std::size_t length, std::size_t first_index)
{
    auto f = [](T t) { return t; };

    if (length == 0)
    {
        // `first_index % length` would divide by zero; an empty range has no
        // match, so the end iterator is the only valid answer.
        std::vector<T> empty;
        auto iter_empty = hpx::parallel::util::unseq_first_n(
            empty.begin(), static_cast<std::ptrdiff_t>(0), f);
        HPX_TEST(iter_empty == empty.end());
        return;
    }

    first_index = first_index % length;

    // Everything before first_index stays zero (no match).
    std::vector<T> v(length, static_cast<T>(0));
    v[first_index] = static_cast<T>(1);
    for (std::size_t i = first_index + 1; i != length; ++i)
    {
        v[i] = static_cast<T>(gen() % 2);
    }

    // The element count is a distance, not a value of the element type T.
    auto iter_test = hpx::parallel::util::unseq_first_n(
        v.begin(), static_cast<std::ptrdiff_t>(length), f);

    auto iter_known = v.begin() + static_cast<std::ptrdiff_t>(first_index);

    HPX_TEST(iter_test == iter_known);
}

void test_unseq_first_n1_dispatch1()
{
test_unseq_first_n1_dispatch2<int>(gen() % 10007, gen());
}

// Checks unseq2_first_n against two vectors with a known first match.
//
// Builds two vectors of `length` elements of type T: zeros before position
// `first_index % length`, ones at that position in both vectors and random
// 0/1 values (drawn from the global `gen`) after it. unseq2_first_n with the
// predicate `t1 && t2` must then return the iterator pair for that position.
//
// A `length` of zero is checked as a pair of empty ranges, for which the
// helper is expected to return the two (begin == end) iterators.
template <typename T>
void test_unseq_first_n2_dispatch2(std::size_t length, std::size_t first_index)
{
    auto f = [](T t1, T t2) { return t1 && t2; };

    if (length == 0)
    {
        // `first_index % length` would divide by zero; empty ranges have no
        // match, so the end iterators are the only valid answer.
        std::vector<T> empty1;
        std::vector<T> empty2;
        auto iter_pair_empty = hpx::parallel::util::unseq2_first_n(
            empty1.begin(), empty2.begin(), static_cast<std::ptrdiff_t>(0), f);
        HPX_TEST(
            iter_pair_empty == std::make_pair(empty1.end(), empty2.end()));
        return;
    }

    first_index = first_index % length;

    // Everything before first_index stays zero in both vectors (no match).
    std::vector<T> v1(length, static_cast<T>(0));
    std::vector<T> v2(length, static_cast<T>(0));

    v1[first_index] = static_cast<T>(1);
    v2[first_index] = static_cast<T>(1);
    for (std::size_t idx = first_index + 1; idx != length; ++idx)
    {
        v1[idx] = static_cast<T>(gen() % 2);
        v2[idx] = static_cast<T>(gen() % 2);
    }

    // The element count is a distance, not a value of the element type T.
    auto iter_pair_test = hpx::parallel::util::unseq2_first_n(
        v1.begin(), v2.begin(), static_cast<std::ptrdiff_t>(length), f);

    auto const offset = static_cast<std::ptrdiff_t>(first_index);
    auto iter_pair_value = std::make_pair(v1.begin() + offset, v2.begin() + offset);

    HPX_TEST(iter_pair_test == iter_pair_value);
}

void test_unseq_first_n2_dispatch1()
{
test_unseq_first_n2_dispatch2<int>(gen() % 10007, gen());
}

// Test driver: runs both randomized checks and returns the value produced by
// hpx::util::report_errors() as the process exit code.
int main(int /*argc*/, char** /*argv*/)
{
    // Single-range helper: the predicate takes one argument.
    test_unseq_first_n1_dispatch1();

    // Two-range helper: the predicate takes two arguments.
    test_unseq_first_n2_dispatch1();

    return hpx::util::report_errors();
}
49 changes: 44 additions & 5 deletions libs/core/config/include/hpx/config/auto_vectorization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,42 @@
#define HPX_PRAGMA(x) _Pragma(#x)
#endif

// Use OpenMP backend for compilers that support OpenMP
#if (_OPENMP >= 201307) || (__INTEL_COMPILER >= 1600) || \
(defined(__clang__) && HPX_CLANG_VERSION >= 30700)
#if (__INTEL_COMPILER >= 1600)

// version specific pragmas to be defined at the beginning
#if (__INTEL_COMPILER >= 1800)
#define HPX_EARLYEXIT_PRESENT
#define HPX_PRAGMA_SIMD_EARLYEXIT HPX_PRAGMA(omp simd early_exit)
#else
#define HPX_PRAGMA_SIMD_EARLYEXIT
#endif

#define HPX_IVDEP
#define HPX_PRAGMA_VECTOR_UNALIGNED HPX_PRAGMA(vector unaligned)
#define HPX_VECTORIZE HPX_PRAGMA(omp simd)
#define HPX_VECTOR_REDUCTION(CLAUSE) HPX_PRAGMA(omp simd reduction(CLAUSE))
#define HPX_DECLARE_SIMD HPX_PRAGMA(omp declare simd)

#define HPX_RESTRICT
#define HPX_UNROLL
#define HPX_UNROLL_N(N)

#define HPX_HAVE_VECTOR_REDUCTION

#elif (_OPENMP >= 201307) || (defined(__clang__) && HPX_CLANG_VERSION >= 30700)

#define HPX_PRAGMA_SIMD_EARLYEXIT
#define HPX_IVDEP
#define HPX_VECTORIZE HPX_PRAGMA(omp simd)
#define HPX_VECTOR_REDUCTION(CLAUSE) HPX_PRAGMA(omp simd reduction(CLAUSE))
#define HPX_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd)
#define HPX_DECLARE_SIMD HPX_PRAGMA(omp declare simd)

#define HPX_RESTRICT
#define HPX_UNROLL HPX_PRAGMA(omp simd)
#define HPX_UNROLL
#define HPX_UNROLL_N(N)

#define HPX_PRAGMA_VECTOR_UNALIGNED

#define HPX_HAVE_VECTOR_REDUCTION

// Fallback to compiler-specific back-ends
Expand All @@ -48,6 +72,9 @@
#define HPX_VECTOR_REDUCTION(CLAUSE)
#define HPX_DECLARE_SIMD

#define HPX_PRAGMA_VECTOR_UNALIGNED
#define HPX_PRAGMA_SIMD_EARLYEXIT

#define HPX_RESTRICT __restrict
#define HPX_UNROLL HPX_PRAGMA(unroll)
#define HPX_UNROLL_N(N) HPX_PRAGMA(unroll(N))
Expand All @@ -60,6 +87,9 @@
#define HPX_VECTOR_REDUCTION(CLAUSE)
#define HPX_DECLARE_SIMD

#define HPX_PRAGMA_VECTOR_UNALIGNED
#define HPX_PRAGMA_SIMD_EARLYEXIT

#define HPX_RESTRICT __restrict
#define HPX_UNROLL HPX_PRAGMA(clang loop unroll(enable))
#define HPX_UNROLL_N(N) HPX_PRAGMA(clang loop unroll_count(N))
Expand All @@ -72,6 +102,9 @@
#define HPX_VECTOR_REDUCTION(CLAUSE)
#define HPX_DECLARE_SIMD

#define HPX_PRAGMA_VECTOR_UNALIGNED
#define HPX_PRAGMA_SIMD_EARLYEXIT

#define HPX_RESTRICT __restrict__
// GCC does not have an auto unroll constant picker
#define HPX_UNROLL HPX_PRAGMA(GCC unroll 8)
Expand All @@ -85,6 +118,9 @@
#define HPX_VECTOR_REDUCTION(CLAUSE)
#define HPX_DECLARE_SIMD

#define HPX_PRAGMA_VECTOR_UNALIGNED
#define HPX_PRAGMA_SIMD_EARLYEXIT

#define HPX_RESTRICT
#define HPX_UNROLL
#define HPX_UNROLL_N(N)
Expand All @@ -98,6 +134,9 @@
#define HPX_VECTOR_REDUCTION(CLAUSE)
#define HPX_DECLARE_SIMD

#define HPX_PRAGMA_VECTOR_UNALIGNED
#define HPX_PRAGMA_SIMD_EARLYEXIT

#define HPX_RESTRICT
#define HPX_UNROLL
#define HPX_UNROLL_N(N)
Expand Down

0 comments on commit 09eabc7

Please sign in to comment.