[SIMD] Intel support for the remaining conversion opcodes
https://bugs.webkit.org/show_bug.cgi?id=249418
rdar://103411760

Reviewed by Yusuke Suzuki.

Add support for conversion operations:
- i32x4.trunc_sat_f32x4_s(a: v128) -> v128
- i32x4.trunc_sat_f32x4_u(a: v128) -> v128
- f32x4.convert_i32x4_u(a: v128) -> v128
https://github.com/WebAssembly/simd/blob/main/proposals/simd/SIMD.md#conversions
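For reference, the lane-wise semantics these opcodes must implement (per the spec linked above) can be modeled in scalar C++. This is an illustrative sketch, not code from this patch; the helper names are ours:

    // Scalar model of the saturating truncations; f32x4.convert_i32x4_u is just
    // static_cast<float>(uint32_t) per lane.
    #include <cmath>
    #include <cstdint>

    int32_t truncSatS(float x)
    {
        if (std::isnan(x))
            return 0;
        if (x <= -0x1.0p31f)
            return INT32_MIN;
        if (x >= 0x1.0p31f)
            return INT32_MAX;
        return static_cast<int32_t>(x);
    }

    uint32_t truncSatU(float x)
    {
        if (std::isnan(x) || x <= -1.0f)
            return 0;
        if (x >= 0x1.0p32f)
            return UINT32_MAX;
        return static_cast<uint32_t>(x); // lanes in (-1, 0) truncate to 0
    }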

* Source/JavaScriptCore/assembler/MacroAssemblerX86_64.h:
(JSC::MacroAssemblerX86_64::vectorTruncSat):
(JSC::MacroAssemblerX86_64::vectorTruncSatUnsignedFloat32):
(JSC::MacroAssemblerX86_64::vectorConvertUnsigned):
(JSC::MacroAssemblerX86_64::vectorMulSat):
* Source/JavaScriptCore/assembler/X86Assembler.h:
(JSC::X86Assembler::pblendw_i8rr):
(JSC::X86Assembler::vmaxps_rrr):
(JSC::X86Assembler::vmaxpd_rrr):
(JSC::X86Assembler::vminps_rrr):
(JSC::X86Assembler::vminpd_rrr):
(JSC::X86Assembler::vcmpunordps_rrr):
(JSC::X86Assembler::vcmpleps_rrr):
(JSC::X86Assembler::vcmpltps_rrr):
(JSC::X86Assembler::vcvttps2dq_rr):
(JSC::X86Assembler::vpand_rrr):
(JSC::X86Assembler::vpslld_i8rr):
(JSC::X86Assembler::pblendw_rr): Deleted.
* Source/JavaScriptCore/b3/air/AirOpcode.opcodes:
* Source/JavaScriptCore/wasm/WasmAirIRGenerator64.cpp:
(JSC::Wasm::AirIRGenerator64::addSIMDV_V):
(JSC::Wasm::AirIRGenerator64::addSIMDRelOp):

Canonical link: https://commits.webkit.org/257965@main
hyjorc1 authored and Yijia Huang committed Dec 16, 2022
1 parent 199b720 commit a068c52c28a8dbb7a2e8a8a544856092657b2b06
Showing 4 changed files with 162 additions and 18 deletions.
@@ -3038,15 +3038,75 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
m_assembler.vroundpd_rr(input, dest, RoundingType::TowardZero);
}

void vectorTruncSat(SIMDInfo simdInfo, FPRegisterID input, FPRegisterID dest)
void vectorTruncSat(SIMDInfo simdInfo, FPRegisterID src, FPRegisterID dest, RegisterID scratchGPR, FPRegisterID scratchFPR1, FPRegisterID scratchFPR2)
{
ASSERT(scalarTypeIsFloatingPoint(simdInfo.lane));
ASSERT(simdInfo.signMode != SIMDSignMode::None);
ASSERT(supportsAVX());
ASSERT_UNUSED(simdInfo, simdInfo.signMode == SIMDSignMode::Signed);
ASSERT(simdInfo.lane == SIMDLane::f32x4);
UNUSED_PARAM(input); UNUSED_PARAM(dest); UNUSED_PARAM(simdInfo);
// FIXME: Need to support
// i32x4.trunc_sat_f32x4_s(a: v128) -> v128
// i32x4.trunc_sat_f32x4_u(a: v128) -> v128

// The instruction cvttps2dq saturates every overflow to 0x80000000 and also returns
// 0x80000000 (rather than 0) for NaN.
// However, i32x4.trunc_sat_f32x4_s requires:
// 1. saturate positive-overflow integer to 0x7FFFFFFF
// 2. saturate negative-overflow integer to 0x80000000
// 3. convert NaN or -0 to 0.

m_assembler.vmovaps_rr(src, scratchFPR1); // scratchFPR1 = src
m_assembler.vcmpunordps_rrr(scratchFPR1, scratchFPR1, scratchFPR1); // scratchFPR1 = NaN mask by unordered comparison
m_assembler.vandnps_rrr(src, scratchFPR1, scratchFPR1); // scratchFPR1 = src with NaN lanes cleared

alignas(16) static constexpr float masks[] = {
0x1.0p+31f,
0x1.0p+31f,
0x1.0p+31f,
0x1.0p+31f,
};
move(TrustedImmPtr(masks), scratchGPR); // scratchGPR = pointer to f32x4 splat of 2^31, the smallest positive-overflow value
m_assembler.vcmpnltps_mrr(0, scratchGPR, scratchFPR1, scratchFPR2); // scratchFPR2 = positive-overflow mask by checking src >= 0x80000000

m_assembler.vcvttps2dq_rr(scratchFPR1, scratchFPR1); // convert scratchFPR1 to integer with overflow saturated to 0x80000000

m_assembler.vpxor_rrr(scratchFPR2, scratchFPR1, dest); // convert positive-overflow lane to 0x7FFFFFFF
}
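For readers following along, the same sequence can be sketched with SSE/AVX intrinsics (illustrative only; the helper name is ours, and the real code emits the VEX-encoded instructions through the MacroAssembler, not intrinsics):

    #include <immintrin.h>

    static __m128i truncSatF32x4S(__m128 src)
    {
        __m128 nanMask = _mm_cmpunord_ps(src, src);                  // all-ones where src is NaN
        __m128 clean = _mm_andnot_ps(nanMask, src);                  // NaN lanes -> +0.0
        __m128 posOvf = _mm_cmpge_ps(clean, _mm_set1_ps(0x1.0p31f)); // lanes >= 2^31
        __m128i ints = _mm_cvttps_epi32(clean);                      // overflow lanes become 0x80000000
        return _mm_xor_si128(ints, _mm_castps_si128(posOvf));        // flip those lanes to 0x7FFFFFFF
    }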

void vectorTruncSatUnsignedFloat32(FPRegisterID src, FPRegisterID dest, RegisterID scratchGPR, FPRegisterID scratchFPR1, FPRegisterID scratchFPR2)
{
ASSERT(supportsAVX());

// https://github.com/WebAssembly/simd/pull/247
// https://github.com/WebAssembly/relaxed-simd/issues/21

// The instruction cvttps2dq saturates every overflow to 0x80000000 and also returns
// 0x80000000 (rather than 0) for NaN.
// However, i32x4.trunc_sat_f32x4_u requires:
// 1. saturate positive-overflow integer to 0xFFFFFFFF
// 2. saturate negative-overflow integer to 0
// 3. convert NaN or -0 to 0.

m_assembler.vxorps_rrr(scratchFPR1, scratchFPR1, scratchFPR1);
m_assembler.vmaxps_rrr(scratchFPR1, src, dest); // dest = f[lane]x4 = src with NaN and negatives cleared

alignas(16) static constexpr float masks[] = {
2147483647.0f,
2147483647.0f,
2147483647.0f,
2147483647.0f,
};
move(TrustedImmPtr(masks), scratchGPR); // scratchGPR = pointer to f32x4 splat of 2^31 (2147483647.0f rounds to 0x1.0p+31f)

m_assembler.vmovaps_rr(dest, scratchFPR2);
m_assembler.vsubps_mrr(0, scratchGPR, scratchFPR2, scratchFPR2); // scratchFPR2 = f[lane - 0x80000000]x4

m_assembler.vcmpnltps_mrr(0, scratchGPR, scratchFPR2, scratchFPR1); // scratchFPR1 = mask for [lane - 0x80000000 >= 0x80000000]x4, i.e. lane >= 2^32

m_assembler.vcvttps2dq_rr(scratchFPR2, scratchFPR2); // scratchFPR2 = i[lane - 0x80000000]x4 with lanes saturated to 0x80000000 on int32 overflow

m_assembler.vpxor_rrr(scratchFPR1, scratchFPR2, scratchFPR2); // scratchFPR2 = i[lane - 0x80000000]x4 with lanes saturated to 0x7FFFFFFF on int32 positive-overflow and 0x80000000 on int32 negative-overflow

m_assembler.vpxor_rrr(scratchFPR1, scratchFPR1, scratchFPR1); // scratchFPR1 = 0
m_assembler.vpmaxsd_rrr(scratchFPR1, scratchFPR2, scratchFPR2); // scratchFPR2 = i[lane - 0x80000000]x4 with lanes saturated to 0x7FFFFFFF on int32 positive-overflow and negatives cleared

m_assembler.vcvttps2dq_rr(dest, dest); // dest = i[lane]x4 with lanes saturated to 0x80000000 on int32 positive-overflow

m_assembler.vpaddd_rrr(scratchFPR2, dest, dest); // dest = dest + scratchFPR2 = i[lane]x4 saturated to 0xFFFFFFFF on uint32 positive-overflow
}
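An intrinsics sketch of the unsigned sequence, for clarity (illustrative only; assumes SSE4.1 for _mm_max_epi32, while the real code relies on the AVX encodings):

    #include <immintrin.h>

    static __m128i truncSatF32x4U(__m128 src)
    {
        const __m128 two31 = _mm_set1_ps(0x1.0p31f);
        __m128 clean = _mm_max_ps(src, _mm_setzero_ps());    // NaN and negatives -> 0.0
        __m128 high = _mm_sub_ps(clean, two31);              // lane - 2^31
        __m128 ovf = _mm_cmpge_ps(high, two31);              // lane >= 2^32
        __m128i highI = _mm_cvttps_epi32(high);              // negative where lane < 2^31
        highI = _mm_xor_si128(highI, _mm_castps_si128(ovf)); // overflow lanes -> 0x7FFFFFFF
        highI = _mm_max_epi32(highI, _mm_setzero_si128());   // clamp the negatives to 0
        __m128i lowI = _mm_cvttps_epi32(clean);              // 0x80000000 where lane >= 2^31
        return _mm_add_epi32(lowI, highI);                   // recombine the two halves
    }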

void vectorTruncSatSignedFloat64(FPRegisterID src, FPRegisterID dest, RegisterID scratchGPR, FPRegisterID scratchFPR)
@@ -3254,9 +3314,17 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
m_assembler.cvtdq2ps_rr(input, dest);
}

void vectorConvertUnsigned(FPRegisterID input, FPRegisterID dest, FPRegisterID scratch)
void vectorConvertUnsigned(FPRegisterID src, FPRegisterID dst, FPRegisterID scratch)
{
UNUSED_PARAM(input); UNUSED_PARAM(dest); UNUSED_PARAM(scratch);
ASSERT(supportsAVX());
m_assembler.vpxor_rrr(scratch, scratch, scratch); // clear scratch
m_assembler.vpblendw_i8rrr(0x55, src, scratch, scratch); // i_low = low 16 bits of each 32-bit lane of src
m_assembler.vpsubd_rrr(scratch, src, dst); // i_high = high 16 bits of each 32-bit lane of src
m_assembler.vcvtdq2ps_rr(scratch, scratch); // f_low = convertToF32(i_low), exact since i_low < 2^16
m_assembler.vpsrld_i8rr(1, dst, dst); // i_half_high = i_high / 2, exact since bit 0 of i_high is clear
m_assembler.vcvtdq2ps_rr(dst, dst); // f_half_high = convertToF32(i_half_high), non-negative as int32
m_assembler.vaddps_rrr(dst, dst, dst); // dst = f_half_high + f_half_high
m_assembler.vaddps_rrr(scratch, dst, dst); // dst = dst + f_low
}
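The same split-and-recombine conversion, sketched with intrinsics (illustrative only; assumes SSE4.1 for _mm_blend_epi16):

    #include <immintrin.h>

    static __m128 convertI32x4U(__m128i src)
    {
        __m128i low = _mm_blend_epi16(_mm_setzero_si128(), src, 0x55); // low 16 bits of each lane
        __m128i high = _mm_sub_epi32(src, low);                        // high 16 bits of each lane
        __m128 fLow = _mm_cvtepi32_ps(low);                            // exact: values < 2^16
        __m128i halfHigh = _mm_srli_epi32(high, 1);                    // exact halving: bit 0 is clear
        __m128 fHalfHigh = _mm_cvtepi32_ps(halfHigh);                  // non-negative as int32
        return _mm_add_ps(_mm_add_ps(fHalfHigh, fHalfHigh), fLow);     // 2 * fHalfHigh + fLow
    }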

void vectorConvertLowUnsignedInt32(FPRegisterID input, FPRegisterID dest, RegisterID scratchGPR, FPRegisterID scratchFPR)
@@ -3781,14 +3849,10 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
vectorSplat(SIMDLane::i16x8, scratchGPR, scratchFPR);
m_assembler.vpcmpeqw_rrr(scratchFPR, dest, scratchFPR);
m_assembler.vpxor_rrr(scratchFPR, dest, dest);
} else if (supportsSupplementalSSE3()) {
// FIXME: SSSE3
} else if (supportsSupplementalSSE3())
RELEASE_ASSERT_NOT_REACHED();
} else {
// FIXME: SSE2
else
RELEASE_ASSERT_NOT_REACHED();
}

}

void vectorSwizzle(FPRegisterID a, FPRegisterID b, FPRegisterID dest)
@@ -356,6 +356,7 @@ class X86Assembler {
OP2_BSWAP = 0xC8,
OP2_PSUBUSB_VdqWdq = 0xD8,
OP2_PSUBUSW_VdqWdq = 0xD9,
OP2_VPAND_VxHxWx = 0xDB,
OP2_PADDUSB_VdqWdq = 0xDC,
OP2_PADDUSW_VdqWdq = 0xDD,
OP2_PAVGB_VdqWdq = 0xE0,
@@ -390,6 +391,7 @@ class X86Assembler {
OP2_SQRTPS_VpsWps = 0x51,
OP2_SQRTPD_VpdWpd = 0x51,
OP2_PMADDWD_VdqWdq = 0xF5,
OP2_VPSLLD_VxHxWx = 0x72,
OP2_PCMPEQB_VdqWdq = 0x74,
OP2_PCMPEQW_VdqWdq = 0x75,
OP2_PCMPEQD_VdqWdq = 0x76,
@@ -527,6 +529,7 @@ class X86Assembler {

GROUP11_MOV = 0,

GROUP14_OP_PSLLD = 6,
GROUP14_OP_PSLLQ = 6,
GROUP14_OP_PSRAQ = 4,
GROUP14_OP_PSRLQ = 2,
@@ -2875,7 +2878,7 @@ class X86Assembler {
m_formatter.twoByteOp(OP2_PXOR_VdqWdq, (RegisterID)vd, (RegisterID)vn);
}

void pblendw_rr(uint8_t imm8, XMMRegisterID vn, XMMRegisterID vd)
void pblendw_i8rr(uint8_t imm8, XMMRegisterID vn, XMMRegisterID vd)
{
// https://www.felixcloutier.com/x86/pblendw
// 66 0F 3A 0E /r ib PBLENDW xmm1, xmm2/m128, imm8 | SSE4_1
@@ -4947,6 +4950,14 @@ class X86Assembler {
m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_SSE_66, OP2_PADDQ_VdqWdq, (RegisterID)dest, (RegisterID)left, (RegisterID)right);
}

void vsubps_mrr(int offset, RegisterID base, XMMRegisterID left, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/subps
// VEX.128.0F.WIG 5C /r VSUBPS xmm1,xmm2, xmm3/m128
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_00, OP2_SUBPS_VpsWps, (RegisterID)dest, (RegisterID)left, base, offset);
}

void vsubps_rrr(XMMRegisterID right, XMMRegisterID left, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/subps
@@ -5210,6 +5221,26 @@ class X86Assembler {
m_formatter.vexNdsLigWigThreeByteOp(PRE_SSE_66, VexImpliedBytes::ThreeBytesOp38, OP3_PCMPGTQ_VdqWdq, (RegisterID)dest, (RegisterID)b, (RegisterID)a);
}

void vcmpunordps_rrr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
vcmpps_rrr(PackedCompareCondition::Unordered, a, b, dest);
}

void vcmpleps_rrr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
vcmpps_rrr(PackedCompareCondition::LessThanOrEqual, a, b, dest);
}

void vcmpnltps_mrr(int offset, RegisterID base, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
vcmpps_mrr(PackedCompareCondition::GreaterThanOrEqual, offset, base, xmm2, xmm1);
}

void vcmpltps_rrr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
vcmpps_rrr(PackedCompareCondition::LessThan, a, b, dest);
}

void vcmpps_rrr(PackedCompareCondition condition, XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/cmpps
@@ -5222,6 +5253,15 @@ class X86Assembler {
m_formatter.immediate8(static_cast<uint8_t>(condition));
}

void vcmpps_mrr(PackedCompareCondition condition, int offset, RegisterID base, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
// https://www.felixcloutier.com/x86/cmpps
// VEX.128.0F.WIG C2 /r ib VCMPPS xmm1, xmm2, xmm3/m128, imm8
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) Imm8
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_00, OP2_CMPPS_VpsWpsIb, (RegisterID)xmm1, (RegisterID)xmm2, base, offset);
m_formatter.immediate8(static_cast<uint8_t>(condition));
}

void vcmppd_rrr(PackedCompareCondition condition, XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/cmppd.html
@@ -5234,6 +5274,14 @@ class X86Assembler {
m_formatter.immediate8(static_cast<uint8_t>(condition));
}

void vcvttps2dq_rr(XMMRegisterID vn, XMMRegisterID vd)
{
// https://www.felixcloutier.com/x86/cvttps2dq
// VEX.128.F3.0F.WIG 5B /r VCVTTPS2DQ xmm1, xmm2/m128
// A NA ModRM:reg (w) ModRM:r/m (r) NA NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_F3, OP2_CVTDQ2PS_VsdWsd, (RegisterID)vd, (RegisterID)0, (RegisterID)vn);
}
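Both trunc-sat sequences above work around the fact that cvttps2dq returns the "integer indefinite" value 0x80000000 for NaN and for any lane outside int32 range, which a small intrinsics check illustrates (illustrative snippet, not part of this patch):

    #include <immintrin.h>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        __m128 v = _mm_set_ps(NAN, -1e10f, 1e10f, 1.5f); // _mm_set_ps lists lanes high to low
        int out[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), _mm_cvttps_epi32(v));
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        // prints: 1 -2147483648 -2147483648 -2147483648
        return 0;
    }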

void vcvtdq2ps_rr(XMMRegisterID vn, XMMRegisterID vd)
{
// https://www.felixcloutier.com/x86/cvtdq2ps
@@ -5424,6 +5472,14 @@ class X86Assembler {
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_UNPCKHPD_VpdWpd, (RegisterID)xmm1, (RegisterID)xmm2, (RegisterID)xmm3);
}

void vpand_rrr(XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
// https://www.felixcloutier.com/x86/pand
// VEX.128.66.0F.WIG DB /r VPAND xmm1, xmm2, xmm3/m128
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_VPAND_VxHxWx, (RegisterID)xmm1, (RegisterID)xmm2, (RegisterID)xmm3);
}

void vandps_rrr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/andps
@@ -5568,6 +5624,15 @@ class X86Assembler {
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PSLLD_VdqWdq, (RegisterID)dest, (RegisterID)input, (RegisterID)shift);
}

void vpslld_i8rr(uint8_t imm8, XMMRegisterID input, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/psllw:pslld:psllq
// VEX.128.66.0F.WIG 72 /6 ib VPSLLD xmm1, xmm2, imm8
// D NA VEX.vvvv (w) ModRM:r/m (r) imm8 NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_VPSLLD_VxHxWx, (RegisterID)GROUP14_OP_PSLLD, (RegisterID)dest, (RegisterID)input);
m_formatter.immediate8(imm8);
}

void vpsllq_rrr(XMMRegisterID shift, XMMRegisterID input, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/psllw:pslld:psllq
@@ -1848,6 +1848,12 @@ arm64: VectorNeg U:G:Ptr, U:F:128, D:F:128
arm64: VectorTruncSat U:G:Ptr, U:F:128, D:F:128
SIMDInfo, Tmp, Tmp

x86_64: VectorTruncSat U:G:Ptr, U:F:128, D:F:128, S:G:64, S:F:128, S:F:128
SIMDInfo, Tmp, Tmp, Tmp, Tmp, Tmp

x86_64: VectorTruncSatUnsignedFloat32 U:F:128, D:F:128, S:G:64, S:F:128, S:F:128
Tmp, Tmp, Tmp, Tmp, Tmp

x86_64: VectorTruncSatSignedFloat64 U:F:128, D:F:128, S:G:64, S:F:128
Tmp, Tmp, Tmp, Tmp
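(For context, in the Air opcode DSL each operand is written role:bank:width — U a use, D a def, S a scratch; bank G is general-purpose, F is float/vector — so, as we read it, the new x86_64 forms request one GP scratch plus one or two FP scratches, matching the scratch parameters added to the MacroAssembler methods above.)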

@@ -349,13 +349,22 @@ class AirIRGenerator64 : public AirIRGeneratorBase<AirIRGenerator64, TypedTmp> {
}

if (airOp == B3::Air::VectorTruncSat) {
if (info.lane == SIMDLane::f64x2) {
switch (info.lane) {
case SIMDLane::f64x2:
if (info.signMode == SIMDSignMode::Signed)
append(VectorTruncSatSignedFloat64, v, result, tmpForType(Types::I64), tmpForType(Types::V128));
else
append(VectorTruncSatUnsignedFloat64, v, result, tmpForType(Types::I64), tmpForType(Types::V128));
return { };
case SIMDLane::f32x4:
if (info.signMode == SIMDSignMode::Signed)
append(airOp, Arg::simdInfo(info), v, result, tmpForType(Types::I64), tmpForType(Types::V128), tmpForType(Types::V128));
else
append(VectorTruncSatUnsignedFloat32, v, result, tmpForType(Types::I64), tmpForType(Types::V128), tmpForType(Types::V128));
return { };
default:
RELEASE_ASSERT_NOT_REACHED();
}
return { };
}
}
