Skip to content
Permalink
Browse files
[WebAssembly SIMD] Emulate 8-bit shift instructions and i8x16.popcnt on Intel

https://bugs.webkit.org/show_bug.cgi?id=248995
rdar://103159176

Reviewed by Yusuke Suzuki.

Adds implementations for 8-bit shift and popcount SIMD instructions on Intel,
along with i64x2 arithmetic right shift. With this patch, our support for
WebAssembly SIMD on Intel is feature-complete for BBQ Air, and we pass all
spec tests for the Air backend.

* Source/JavaScriptCore/assembler/MacroAssemblerX86_64.h:
(JSC::MacroAssemblerX86_64::compareFloatingPointVector):
(JSC::MacroAssemblerX86_64::vectorAndnot):
(JSC::MacroAssemblerX86_64::vectorTruncSatSignedFloat64):
(JSC::MacroAssemblerX86_64::vectorUshl8):
(JSC::MacroAssemblerX86_64::vectorUshr8):
(JSC::MacroAssemblerX86_64::vectorSshr8):
(JSC::MacroAssemblerX86_64::vectorPopcnt): Deleted.
* Source/JavaScriptCore/assembler/X86Assembler.h:
(JSC::X86Assembler::vpunpcklbw_rrr):
(JSC::X86Assembler::vpunpckhbw_rrr):
(JSC::X86Assembler::vcmpps_rrr):
(JSC::X86Assembler::vcmppd_rrr):
(JSC::X86Assembler::vpsllw_i8rr):
(JSC::X86Assembler::vpslld_i8rr):
* Source/JavaScriptCore/b3/air/AirLowerMacros.cpp:
(JSC::B3::Air::lowerMacros):
* Source/JavaScriptCore/b3/air/AirOpcode.opcodes:
* Source/JavaScriptCore/wasm/WasmAirIRGenerator64.cpp:
(JSC::Wasm::AirIRGenerator64::addSIMDV_V):
(JSC::Wasm::AirIRGenerator64::addSIMDRelOp):
(JSC::Wasm::AirIRGenerator64::addSIMDShift):

Canonical link: https://commits.webkit.org/258089@main
  • Loading branch information
David Degazio committed Dec 19, 2022
1 parent f25b969 commit 7d8a35c36444a89da6244c8122e7bd12780ab3f6
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 86 deletions.
@@ -2461,39 +2461,39 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
switch (cond) {
case DoubleEqualAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rrr(PackedCompareCondition::Equal, right, left, dest);
m_assembler.vcmpps_rrr(PackedCompareCondition::EqualAndOrdered, right, left, dest);
else
m_assembler.vcmppd_rrr(PackedCompareCondition::Equal, right, left, dest);
m_assembler.vcmppd_rrr(PackedCompareCondition::EqualAndOrdered, right, left, dest);
break;
case DoubleNotEqualOrUnordered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rrr(PackedCompareCondition::NotEqual, right, left, dest);
m_assembler.vcmpps_rrr(PackedCompareCondition::NotEqualOrUnordered, right, left, dest);
else
m_assembler.vcmppd_rrr(PackedCompareCondition::NotEqual, right, left, dest);
m_assembler.vcmppd_rrr(PackedCompareCondition::NotEqualOrUnordered, right, left, dest);
break;
case DoubleGreaterThanAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rrr(PackedCompareCondition::GreaterThan, right, left, dest);
m_assembler.vcmpps_rrr(PackedCompareCondition::GreaterThanAndOrdered, right, left, dest);
else
m_assembler.vcmppd_rrr(PackedCompareCondition::GreaterThan, right, left, dest);
m_assembler.vcmppd_rrr(PackedCompareCondition::GreaterThanAndOrdered, right, left, dest);
break;
case DoubleGreaterThanOrEqualAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rrr(PackedCompareCondition::GreaterThanOrEqual, right, left, dest);
m_assembler.vcmpps_rrr(PackedCompareCondition::GreaterThanOrEqualAndOrdered, right, left, dest);
else
m_assembler.vcmppd_rrr(PackedCompareCondition::GreaterThanOrEqual, right, left, dest);
m_assembler.vcmppd_rrr(PackedCompareCondition::GreaterThanOrEqualAndOrdered, right, left, dest);
break;
case DoubleLessThanAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rrr(PackedCompareCondition::LessThan, right, left, dest);
m_assembler.vcmpps_rrr(PackedCompareCondition::LessThanAndOrdered, right, left, dest);
else
m_assembler.vcmppd_rrr(PackedCompareCondition::LessThan, right, left, dest);
m_assembler.vcmppd_rrr(PackedCompareCondition::LessThanAndOrdered, right, left, dest);
break;
case DoubleLessThanOrEqualAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rrr(PackedCompareCondition::LessThanOrEqual, right, left, dest);
m_assembler.vcmpps_rrr(PackedCompareCondition::LessThanOrEqualAndOrdered, right, left, dest);
else
m_assembler.vcmppd_rrr(PackedCompareCondition::LessThanOrEqual, right, left, dest);
m_assembler.vcmppd_rrr(PackedCompareCondition::LessThanOrEqualAndOrdered, right, left, dest);
break;
default:
RELEASE_ASSERT_NOT_REACHED();
@@ -2914,20 +2914,28 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
{
RELEASE_ASSERT(supportsAVX());
ASSERT(scalarTypeIsFloatingPoint(simdInfo.lane));
// When comparing min(0.0, -0.0), the WebAssembly semantics of Pmin say we should return 0.0, since
// Pmin is defined as right < left ? right : left. However, the vminps instruction breaks ties towards the second
// source operand - essentially left < right ? left : right. So we reverse the usual operand order for the
// instruction.
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vminps_rrr(right, left, dest);
m_assembler.vminps_rrr(left, right, dest);
else
m_assembler.vminpd_rrr(right, left, dest);
m_assembler.vminpd_rrr(left, right, dest);
}

void vectorPmax(SIMDInfo simdInfo, FPRegisterID left, FPRegisterID right, FPRegisterID dest)
{
RELEASE_ASSERT(supportsAVX());
ASSERT(scalarTypeIsFloatingPoint(simdInfo.lane));
// When comparing max(0.0, -0.0), the WebAssembly semantics of Pmax say we should return 0.0, since
// Pmax is defined as right > left ? right : left. However, the vmaxps instruction breaks ties towards the second
// source operand - essentially left > right ? left : right. So we reverse the usual operand order for the
// instruction.
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vmaxps_rrr(right, left, dest);
m_assembler.vmaxps_rrr(left, right, dest);
else
m_assembler.vmaxpd_rrr(right, left, dest);
m_assembler.vmaxpd_rrr(left, right, dest);
}

void vectorBitwiseSelect(FPRegisterID left, FPRegisterID right, FPRegisterID inputBitsAndDest)
@@ -2946,7 +2954,10 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
{
RELEASE_ASSERT(supportsAVX());
RELEASE_ASSERT(simdInfo.lane == SIMDLane::v128);
m_assembler.vandnps_rrr(right, left, dest);
// WebAssembly v128.andnot is equivalent to (v128.and left (v128.not right)). However, the Intel
// vandnps instruction negates the first source operand, essentially (v128.and (v128.not left) right).
// To achieve correct WebAssembly semantics, we provide left and right in reversed order here.
m_assembler.vandnps_rrr(left, right, dest);
}

void vectorOr(SIMDInfo simdInfo, FPRegisterID left, FPRegisterID right, FPRegisterID dest)
@@ -3014,12 +3025,6 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
}
}

// Stub: i8x16.popcnt has no Intel implementation here yet; the body is
// intentionally empty and emits nothing. Parameters are marked unused to
// keep the build warning-free until a real lowering is provided.
void vectorPopcnt(SIMDInfo simdInfo, FPRegisterID input, FPRegisterID dest)
{
ASSERT(simdInfo.lane == SIMDLane::i8x16);
UNUSED_PARAM(input); UNUSED_PARAM(dest); UNUSED_PARAM(simdInfo);
}

using RoundingType = X86Assembler::RoundingType;

void vectorCeil(SIMDInfo simdInfo, FPRegisterID input, FPRegisterID dest)
@@ -3134,7 +3139,7 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {

using PackedCompareCondition = X86Assembler::PackedCompareCondition;

m_assembler.vcmppd_rrr(PackedCompareCondition::Equal, src, src, scratchFPR);
m_assembler.vcmppd_rrr(PackedCompareCondition::EqualAndOrdered, src, src, scratchFPR);
move(TrustedImmPtr(masks), scratchGPR);
m_assembler.vandpd_mrr(0, scratchGPR, scratchFPR, scratchFPR);
m_assembler.vminpd_rrr(scratchFPR, src, dest);
@@ -3389,6 +3394,75 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
}
}

// Emulates the i8x16 left shift: x86 has no 8-bit-lane shift instruction, so we
// widen each byte to a 16-bit word, shift word-wise, mask the results back down
// to 8 bits, and repack. tmp1 and tmp2 are clobbered.
// NOTE(review): assumes the shift count register already holds a value masked to
// 0..7 per Wasm shift semantics — confirm in the Air lowering that feeds this.
void vectorUshl8(FPRegisterID input, FPRegisterID shift, FPRegisterID dest, FPRegisterID tmp1, FPRegisterID tmp2)
{
RELEASE_ASSERT(supportsAVX());

// Unpack and zero-extend low input bytes.
// tmp2 is zeroed first so interleaving input bytes with tmp2 yields
// zero-extended 16-bit words in tmp1.
m_assembler.vxorps_rrr(tmp2, tmp2, tmp2);
m_assembler.vpunpcklbw_rrr(input, tmp2, tmp1);

// Word-wise shift low input bytes into tmp1.
m_assembler.vpsllw_rrr(shift, tmp1, tmp1);

// Unpack and zero-extend high input bytes.
// tmp2 still holds zeroes here; its old contents are overwritten with the widened high bytes.
m_assembler.vpunpckhbw_rrr(input, tmp2, tmp2);

// Word-wise shift high input bytes into tmp2.
m_assembler.vpsllw_rrr(shift, tmp2, tmp2);

// Mask away higher bits of left-shifted results.
// Left-shifting can spill bits into the high byte of each word; shifting each
// word left then logically right by 8 clears that high byte, so the unsigned
// saturating pack below is exact rather than saturating.
m_assembler.vpsllw_i8rr(8, tmp1, tmp1);
m_assembler.vpsllw_i8rr(8, tmp2, tmp2);
m_assembler.vpsrlw_i8rr(8, tmp1, tmp1);
m_assembler.vpsrlw_i8rr(8, tmp2, tmp2);

// Pack low and high results into destination.
m_assembler.vpackuswb_rrr(tmp2, tmp1, dest);
}

// Emulates the i8x16 unsigned (logical) right shift by widening bytes to words,
// shifting word-wise, and repacking. Unlike vectorUshl8, no masking pass is
// needed: a logical right shift of a zero-extended byte can never set bits in
// the high byte of the word, so the unsigned saturating pack is already exact.
// tmp1 and tmp2 are clobbered.
void vectorUshr8(FPRegisterID input, FPRegisterID shift, FPRegisterID dest, FPRegisterID tmp1, FPRegisterID tmp2)
{
RELEASE_ASSERT(supportsAVX());

// Unpack and zero-extend low input bytes.
// tmp2 is zeroed so interleaving with it zero-extends each byte to a word.
m_assembler.vxorps_rrr(tmp2, tmp2, tmp2);
m_assembler.vpunpcklbw_rrr(input, tmp2, tmp1);

// Word-wise shift low input bytes into tmp1.
m_assembler.vpsrlw_rrr(shift, tmp1, tmp1);

// Unpack and zero-extend high input bytes.
m_assembler.vpunpckhbw_rrr(input, tmp2, tmp2);

// Word-wise shift high input bytes into tmp2.
m_assembler.vpsrlw_rrr(shift, tmp2, tmp2);

// Pack low and high results into destination.
m_assembler.vpackuswb_rrr(tmp2, tmp1, dest);
}

// Emulates the i8x16 signed (arithmetic) right shift by sign-extending bytes to
// words, shifting word-wise, and repacking with a signed saturating pack. Since
// an arithmetic right shift of a sign-extended int8 value always stays within
// int8 range, the saturation in vpacksswb never triggers and the pack is exact.
// tmp1 and tmp2 are clobbered.
void vectorSshr8(FPRegisterID input, FPRegisterID shift, FPRegisterID dest, FPRegisterID tmp1, FPRegisterID tmp2)
{
RELEASE_ASSERT(supportsAVX());

// Unpack and sign-extend low input bytes.
m_assembler.vpmovsxbw_rr(input, tmp1);

// Word-wise shift low input bytes into tmp1.
m_assembler.vpsraw_rrr(shift, tmp1, tmp1);

// Unpack and sign-extend high input bytes.
// The vpshufd immediate 0b00001110 selects dwords {2, 3} into the low half,
// moving input's upper 8 bytes down so vpmovsxbw can widen them.
m_assembler.vpshufd_i8rr(0b00001110, input, tmp2);
m_assembler.vpmovsxbw_rr(tmp2, tmp2);

// Word-wise shift high input bytes into tmp2.
m_assembler.vpsraw_rrr(shift, tmp2, tmp2);

// Pack low and high results into destination.
m_assembler.vpacksswb_rrr(tmp2, tmp1, dest);
}

void vectorSshr8(SIMDInfo simdInfo, FPRegisterID input, TrustedImm32 shift, FPRegisterID dest)
{
RELEASE_ASSERT(scalarTypeIsIntegral(simdInfo.lane));
@@ -309,8 +309,10 @@ class X86Assembler {
OP2_DIVSD_VsdWsd = 0x5E,
OP2_MAXPS_VpsWps = 0x5F,
OP2_MAXPD_VpdWpd = 0x5F,
OP2_PUNPCKLBW_VdqWdq = 0x60,
OP2_PACKSSWB_VdqWdq = 0x63,
OP2_PACKUSWB_VdqWdq = 0x67,
OP2_PUNPCKHBW_VdqWdq = 0x68,
OP2_PACKSSDW_VdqWdq = 0x6B,
OP2_PUNPCKLQDQ_VdqWdq = 0x6C,
OP2_MOVD_VdEd = 0x6E,
@@ -319,8 +321,10 @@ class X86Assembler {
OP2_PSHUFD_VdqWdqIb = 0x70,
OP2_PSHUFLW_VdqWdqIb = 0x70,
OP2_PSHUFHW_VdqWdqIb = 0x70,
OP2_PSLLW_UdqIb = 0x71,
OP2_PSRLW_UdqIb = 0x71,
OP2_PSRAW_UdqIb = 0x71,
OP2_PSLLD_UdqIb = 0x72,
OP2_PSRLD_UdqIb = 0x72,
OP2_PSRAD_UdqIb = 0x72,
OP2_PSLLQ_UdqIb = 0x73,
@@ -2910,14 +2914,22 @@ class X86Assembler {
};

enum class PackedCompareCondition : uint8_t {
Equal = 0,
LessThan = 1,
LessThanOrEqual = 2,
EqualAndOrdered = 0,
LessThanAndOrdered = 1,
LessThanOrEqualAndOrdered = 2,
Unordered = 3,
NotEqual = 4,
GreaterThanOrEqual = 5, // Also called "NotLessThan" in the Intel manual
GreaterThan = 6, // Also called "NotLessThanOrEqual" in the Intel manual
Ordered = 7
NotEqualOrUnordered = 4,
NotLessThanOrUnordered = 5,
NotLessThanOrEqualOrUnordered = 6,
Ordered = 7,
EqualOrUnordered = 8,
NotGreaterThanOrEqualOrUnordered = 9,
NotGreaterThanOrUnordered = 10,
False = 11,
NotEqualAndOrdered = 12,
GreaterThanOrEqualAndOrdered = 13,
GreaterThanAndOrdered = 14,
True = 15
};

void cvtdq2ps_rr(XMMRegisterID vn, XMMRegisterID vd)
@@ -4481,6 +4493,22 @@ class X86Assembler {
m_formatter.vexNdsLigWigThreeByteOp(PRE_SSE_66, VexImpliedBytes::ThreeBytesOp38, OP3_VBROADCASTSS_VxWd, (RegisterID)dst, (RegisterID)0, base, offset);
}

// Emits VPUNPCKLBW: interleaves the low 8 bytes of the two source operands into xmm1.
// NOTE(review): per the emitter call below, the first parameter is encoded into
// VEX.vvvv and the second into ModRM:r/m — the reverse of what the parameter
// names suggest relative to the manual's "xmm1, xmm2, xmm3" operand labels.
// Callers (e.g. vectorUshl8) rely on the first argument being SRC1; confirm
// against vexNdsLigWigTwoByteOp's parameter order before reusing.
void vpunpcklbw_rrr(XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
// VEX.128.66.0F.WIG 60/r VPUNPCKLBW xmm1, xmm2, xmm3/m128
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PUNPCKLBW_VdqWdq, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
}

// Emits VPUNPCKHBW: interleaves the high 8 bytes of the two source operands into xmm1.
// NOTE(review): as with vpunpcklbw_rrr, the first parameter is encoded into
// VEX.vvvv and the second into ModRM:r/m, reversed relative to the parameter
// names — confirm against vexNdsLigWigTwoByteOp's parameter order before reusing.
void vpunpckhbw_rrr(XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
// VEX.128.66.0F.WIG 68/r VPUNPCKHBW xmm1, xmm2, xmm3/m128
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PUNPCKHBW_VdqWdq, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
}

void vpunpcklqdq_rrr(XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
@@ -5228,28 +5256,36 @@ class X86Assembler {

void vcmpleps_rrr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
vcmpps_rrr(PackedCompareCondition::LessThanOrEqual, a, b, dest);
vcmpps_rrr(PackedCompareCondition::LessThanOrEqualAndOrdered, a, b, dest);
}

void vcmpnltps_mrr(int offset, RegisterID base, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
vcmpps_mrr(PackedCompareCondition::GreaterThanOrEqual, offset, base, xmm2, xmm1);
vcmpps_mrr(PackedCompareCondition::GreaterThanOrEqualAndOrdered, offset, base, xmm2, xmm1);
}

void vcmpltps_rrr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
vcmpps_rrr(PackedCompareCondition::LessThan, a, b, dest);
vcmpps_rrr(PackedCompareCondition::LessThanAndOrdered, a, b, dest);
}

void vcmpps_rrr(PackedCompareCondition condition, XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/cmpps
// VEX.128.0F.WIG C2 /r ib VCMPPS xmm1, xmm2, xmm3/m128, imm8
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) Imm8
if (condition == PackedCompareCondition::Equal || condition == PackedCompareCondition::NotEqual)
switch (condition) {
case PackedCompareCondition::EqualAndOrdered:
case PackedCompareCondition::NotEqualOrUnordered:
case PackedCompareCondition::Unordered:
case PackedCompareCondition::Ordered:
case PackedCompareCondition::EqualOrUnordered:
case PackedCompareCondition::NotEqualAndOrdered:
m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_SSE_00, OP2_CMPPS_VpsWpsIb, (RegisterID)dest, (RegisterID)b, (RegisterID)a);
else
break;
default:
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_00, OP2_CMPPS_VpsWpsIb, (RegisterID)dest, (RegisterID)b, (RegisterID)a);
}
m_formatter.immediate8(static_cast<uint8_t>(condition));
}

@@ -5264,13 +5300,21 @@ class X86Assembler {

void vcmppd_rrr(PackedCompareCondition condition, XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/cmppd.html
// https://www.felixcloutier.com/x86/cmppd
// VEX.128.66.0F.WIG C2 /r ib VCMPPD xmm1, xmm2, xmm3/m128, imm8
// B NA ModRM:reg (w) VEX.vvvv (r) ModRM:r/m (r) Imm8
if (condition == PackedCompareCondition::Equal || condition == PackedCompareCondition::NotEqual)
switch (condition) {
case PackedCompareCondition::EqualAndOrdered:
case PackedCompareCondition::NotEqualOrUnordered:
case PackedCompareCondition::Unordered:
case PackedCompareCondition::Ordered:
case PackedCompareCondition::EqualOrUnordered:
case PackedCompareCondition::NotEqualAndOrdered:
m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_SSE_66, OP2_CMPPD_VpdWpdIb, (RegisterID)dest, (RegisterID)b, (RegisterID)a);
else
break;
default:
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_CMPPD_VpdWpdIb, (RegisterID)dest, (RegisterID)b, (RegisterID)a);
}
m_formatter.immediate8(static_cast<uint8_t>(condition));
}

@@ -5315,6 +5359,24 @@ class X86Assembler {
m_formatter.immediate8(imm8);
}

// Emits VPSLLW with an immediate shift count (word-wise logical left shift).
void vpsllw_i8rr(uint8_t shift, XMMRegisterID src, XMMRegisterID dst)
{
// https://www.felixcloutier.com/x86/psllw:pslld:psllq
// VEX.128.66.0F.WIG 71 /6 ib VPSLLW xmm1, xmm2, imm8
// D NA VEX.vvvv (w) ModRM:r/m (r) imm8 NA
// GROUP14_OP_PSLLQ supplies the ModRM reg-field opcode extension /6, which is
// presumably the same numeric value as VPSLLW's /6 extension for opcode 0x71 —
// confirm against the opcode-group tables in the Intel manual.
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PSLLW_UdqIb, (RegisterID)GROUP14_OP_PSLLQ, (RegisterID)dst, (RegisterID)src);
m_formatter.immediate8(shift);
}

// Emits VPSLLD with an immediate shift count (dword-wise logical left shift).
void vpslld_i8rr(uint8_t shift, XMMRegisterID src, XMMRegisterID dst)
{
// https://www.felixcloutier.com/x86/psllw:pslld:psllq
// VEX.128.66.0F.WIG 72 /6 ib VPSLLD xmm1, xmm2, imm8
// D NA VEX.vvvv (w) ModRM:r/m (r) imm8 NA
// GROUP14_OP_PSLLQ supplies the ModRM reg-field opcode extension /6, which is
// presumably the same numeric value as VPSLLD's /6 extension for opcode 0x72 —
// confirm against the opcode-group tables in the Intel manual.
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PSLLD_UdqIb, (RegisterID)GROUP14_OP_PSLLQ, (RegisterID)dst, (RegisterID)src);
m_formatter.immediate8(shift);
}

void vpsrlw_i8rr(uint8_t shift, XMMRegisterID src, XMMRegisterID dst)
{
// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
@@ -5624,15 +5686,6 @@ class X86Assembler {
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PSLLD_VdqWdq, (RegisterID)dest, (RegisterID)input, (RegisterID)shift);
}

// Emits VPSLLD with an immediate shift count, using the GROUP14_OP_PSLLD
// reg-field extension and the OP2_VPSLLD_VxHxWx opcode constant.
void vpslld_i8rr(uint8_t imm8, XMMRegisterID input, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/psllw:pslld:psllq
// VEX.128.66.0F.WIG 72 /6 ib VPSLLD xmm1, xmm2, imm8
// D NA VEX.vvvv (w) ModRM:r/m (r) imm8 NA
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_VPSLLD_VxHxWx, (RegisterID)GROUP14_OP_PSLLD, (RegisterID)dest, (RegisterID)input);
m_formatter.immediate8(imm8);
}

void vpsllq_rrr(XMMRegisterID shift, XMMRegisterID input, XMMRegisterID dest)
{
// https://www.felixcloutier.com/x86/psllw:pslld:psllq

0 comments on commit 7d8a35c

Please sign in to comment.