Use short->int cvt instructions when possible

Signed-off-by: Matthias Kretz <kretz@kde.org>
VcDevel · Oct 25, 2018 · 6fb56ab · 6fb56ab
1 parent 17d7fee
commit 6fb56ab
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 20 deletions.
diff --git a/Vc/avx/simd_cast.h b/Vc/avx/simd_cast.h
@@ -1246,14 +1246,8 @@ Vc_SIMD_CAST_AVX_2(double_v,  float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.d
 Vc_SIMD_CAST_AVX_1(double_v,    int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); }
 Vc_SIMD_CAST_AVX_1( float_v,    int_v) { return _mm256_cvttps_epi32(x.data()); }
 Vc_SIMD_CAST_AVX_1(  uint_v,    int_v) { return x.data(); }
-Vc_SIMD_CAST_AVX_1( short_v,    int_v) {
-    const auto tmp = Mem::permute4x64<X0, X2, X1, X3>(x.data());
-    return _mm256_srai_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16);
-}
-Vc_SIMD_CAST_AVX_1(ushort_v,    int_v) {
-    const auto tmp = Mem::permute4x64<X0, X2, X1, X3>(x.data());
-    return _mm256_srli_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16);
-}
+Vc_SIMD_CAST_AVX_1( short_v,    int_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
+Vc_SIMD_CAST_AVX_1(ushort_v,    int_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
 #endif
 
 // 2: to int_v {{{3
@@ -1273,14 +1267,8 @@ Vc_SIMD_CAST_AVX_1( float_v,   uint_v) {
         _mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps())));
 }
 Vc_SIMD_CAST_AVX_1(   int_v,   uint_v) { return x.data(); }
-Vc_SIMD_CAST_AVX_1( short_v,   uint_v) {
-    const auto tmp = Mem::permute4x64<X0, X2, X1, X3>(x.data());
-    return _mm256_srai_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16);
-}
-Vc_SIMD_CAST_AVX_1(ushort_v,   uint_v) {
-    const auto tmp = Mem::permute4x64<X0, X2, X1, X3>(x.data());
-    return _mm256_srli_epi32(_mm256_unpacklo_epi16(tmp, tmp), 16);
-}
+Vc_SIMD_CAST_AVX_1( short_v,   uint_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
+Vc_SIMD_CAST_AVX_1(ushort_v,   uint_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
 #endif
 
 // 2: to uint_v {{{3

diff --git a/Vc/sse/casts.h b/Vc/sse/casts.h
@@ -68,8 +68,20 @@ Vc_INTRINSIC __m128i convert(__m128  v, ConvertTag<float , int   >) { return _mm
 Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, int   >) { return _mm_cvttpd_epi32(v); }
 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int   , int   >) { return v; }
 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint  , int   >) { return v; }
-Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int   >) { return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16); }
-Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int   >) { return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16); }
+Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int   >) {
+#ifdef Vc_IMPL_SSE4_1
+    return _mm_cvtepi16_epi32(v);
+#else
+    return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
+#endif
+}
+Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int   >) {
+#ifdef Vc_IMPL_SSE4_1
+    return _mm_cvtepu16_epi32(v);
+#else
+    return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16);
+#endif
+}
 Vc_INTRINSIC __m128i convert(__m128  v, ConvertTag<float , uint  >) {
     return _mm_castps_si128(
         blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(v)),
@@ -91,8 +103,8 @@ Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, uint  >) {
 }
 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int   , uint  >) { return v; }
 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint  , uint  >) { return v; }
-Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint  >) { return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16); }
-Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint  >) { return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16); }
+Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint  >) { return convert(v, ConvertTag<short, int>()); }
+Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint  >) { return convert(v, ConvertTag<ushort, int>()); }
 Vc_INTRINSIC __m128  convert(__m128  v, ConvertTag<float , float >) { return v; }
 Vc_INTRINSIC __m128  convert(__m128d v, ConvertTag<double, float >) { return _mm_cvtpd_ps(v); }
 Vc_INTRINSIC __m128  convert(__m128i v, ConvertTag<int   , float >) { return _mm_cvtepi32_ps(v); }