Skip to content

Commit

Permalink
Fix non-SSE4 trunc, floor, and ceil
Browse files Browse the repository at this point in the history
Fixes: gh-200

Signed-off-by: Matthias Kretz <kretz@kde.org>
  • Loading branch information
mattkretz committed Jun 12, 2018
1 parent de75580 commit 288d0d9
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 69 deletions.
101 changes: 32 additions & 69 deletions sse/math.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,84 +116,47 @@ inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data())
inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
#else
namespace Detail
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
static inline void floor_shift(SSE::float_v &v, SSE::float_v::AsArg e)
{
SSE::int_v x = SSE::_mm_setallone_si128();
x <<= 23;
x >>= simd_cast<SSE::int_v>(e);
v = Detail::operator&(v, reinterpret_components_cast<SSE::float_v>(x));
const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
_mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
_mm_set1_epi32(0x4b000000))); // the exponent is so large that no mantissa bits
// signify fractional values (0x3f8 + 23*8 = 0x4b0)
return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
_mm_and_ps(no_fractional_values, truncated));
}

static inline void floor_shift(SSE::double_v &v, SSE::double_v::AsArg e)
inline SSE::Vector<double> trunc(SSE::Vector<double> x)
{
const long long initialMask = 0xfff0000000000000ull;
const SSE::uint_v shifts = simd_cast<SSE::uint_v>(e);
union d_ll
{
long long ll;
double d;
};
d_ll mask0 = {initialMask >> shifts[0]};
d_ll mask1 = {initialMask >> shifts[1]};
v = Detail::operator&(v, SSE::double_v(_mm_setr_pd(mask0.d, mask1.d)));
const auto abs_x = Vc::abs(x).data();
const auto min_no_fractional_bits =
_mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull)); // 0x3ff + 52 = 0x433
__m128d truncated =
_mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
// due to rounding, the result can be too large. In this case `truncated >
// abs(x)` holds, so subtract 1 to truncated if `abs(x) < truncated`
truncated = _mm_sub_pd(truncated,
_mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
// finally, fix the sign bit:
return _mm_or_pd(
_mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
truncated);
}
} // namespace Detail

template <typename T> inline SSE::Vector<T> trunc(SSE::Vector<T> _v)
template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
{
typedef SSE::Vector<T> V;
typedef typename V::Mask M;

V v = _v;
V e = exponent(abs(v));
const M negativeExponent = e < 0;
e.setZero(negativeExponent);
//const M negativeInput = v < V::Zero();

Detail::floor_shift(v, e);

v.setZero(negativeExponent);
//v(negativeInput && _v != v) -= V::One();
return v;
}

template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> _v)
{
typedef SSE::Vector<T> V;
typedef typename V::Mask M;

V v = _v;
V e = exponent(abs(v));
const M negativeExponent = e < 0;
e.setZero(negativeExponent);
const M negativeInput = v < V::Zero();

Detail::floor_shift(v, e);

v.setZero(negativeExponent);
v(negativeInput && _v != v) -= V::One();
return v;
}
auto y = trunc(x);
y(!(y == x) && x < 0) -= 1;
return y;
}

template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> _v)
template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
{
typedef SSE::Vector<T> V;
typedef typename V::Mask M;

V v = _v;
V e = exponent(abs(v));
const M negativeExponent = e < 0;
e.setZero(negativeExponent);
const M positiveInput = v > V::Zero();

Detail::floor_shift(v, e);

v.setZero(negativeExponent);
v(positiveInput && _v != v) += V::One();
return v;
}
auto y = trunc(x);
y(!(y == x || x < 0)) += 1;
return y;
}
#endif
// fma {{{1
template <typename T>
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ vc_add_test(ulp)
vc_add_test(logarithm)
vc_add_test(trigonometric)
vc_add_test(math)
vc_add_test(gh200)
vc_add_test(reductions)
vc_add_test(mask)
vc_add_test(utils)
Expand Down
13 changes: 13 additions & 0 deletions tests/gh200.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "unittest.h"
#include <Vc/Vc>

TEST_TYPES(V, rounding, (RealVectors))
{
using T = typename V::value_type;
for (T d : { 1e100, 2.1, 2.9, 1.0, 2.0, 3.0, -2.1, -2.9, -1e100 }) {
V v(d);
COMPARE(Vc::trunc(v), V(std::trunc(d))) << "d = " << d;
COMPARE(Vc::floor(v), V(std::floor(d))) << "d = " << d;
COMPARE(Vc::ceil(v), V(std::ceil(d))) << "d = " << d;
}
}

0 comments on commit 288d0d9

Please sign in to comment.