Skip to content
Permalink
Browse files
[WebAssembly SIMD] Support vector comparisons on Intel
https://bugs.webkit.org/show_bug.cgi?id=248568
rdar://103089559

Reviewed by Yusuke Suzuki.

Implements support for integer and floating point vector comparisons for the Intel
x86_64 macro assembler.

* Source/JavaScriptCore/assembler/MacroAssemblerX86_64.h:
(JSC::MacroAssemblerX86_64::compareFloatingPointVector):
(JSC::MacroAssemblerX86_64::compareIntegerVector):
* Source/JavaScriptCore/assembler/X86Assembler.h:
(JSC::X86Assembler::vpcmpeqq_rr):
(JSC::X86Assembler::vpcmpgtb_rr):
(JSC::X86Assembler::vpcmpgtw_rr):
(JSC::X86Assembler::vpcmpgtd_rr):
(JSC::X86Assembler::vpcmpgtq_rr):
(JSC::X86Assembler::vcmpps_rr):
(JSC::X86Assembler::vcmppd_rr):
* Source/JavaScriptCore/wasm/WasmAirIRGenerator.cpp:
(JSC::Wasm::AirIRGenerator::addSIMDRelOp):

Canonical link: https://commits.webkit.org/257532@main
  • Loading branch information
David Degazio committed Dec 8, 2022
1 parent 066e5bd commit c6145db418b325ac7054c3bddc544b59ac535163
Show file tree
Hide file tree
Showing 3 changed files with 295 additions and 24 deletions.
@@ -2254,23 +2254,47 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {

void compareFloatingPointVector(DoubleCondition cond, SIMDInfo simdInfo, FPRegisterID left, FPRegisterID right, FPRegisterID dest)
{
RELEASE_ASSERT(supportsAVXForSIMD());
RELEASE_ASSERT(scalarTypeIsFloatingPoint(simdInfo.lane));
UNUSED_PARAM(left); UNUSED_PARAM(right); UNUSED_PARAM(dest);

using PackedCompareCondition = X86Assembler::PackedCompareCondition;

switch (cond) {
case DoubleEqualAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rr(PackedCompareCondition::Equal, left, right, dest);
else
m_assembler.vcmppd_rr(PackedCompareCondition::Equal, left, right, dest);
break;
case DoubleNotEqualOrUnordered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rr(PackedCompareCondition::NotEqual, left, right, dest);
else
m_assembler.vcmppd_rr(PackedCompareCondition::NotEqual, left, right, dest);
break;
case DoubleGreaterThanAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rr(PackedCompareCondition::GreaterThan, left, right, dest);
else
m_assembler.vcmppd_rr(PackedCompareCondition::GreaterThan, left, right, dest);
break;
case DoubleGreaterThanOrEqualAndOrdered:
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rr(PackedCompareCondition::GreaterThanOrEqual, left, right, dest);
else
m_assembler.vcmppd_rr(PackedCompareCondition::GreaterThanOrEqual, left, right, dest);
break;
case DoubleLessThanAndOrdered:
// The packed compare has a native less-than predicate, so left/right are passed through unswapped.
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rr(PackedCompareCondition::LessThan, left, right, dest);
else
m_assembler.vcmppd_rr(PackedCompareCondition::LessThan, left, right, dest);
break;
case DoubleLessThanOrEqualAndOrdered:
// The packed compare has a native less-than-or-equal predicate, so left/right are passed through unswapped.
if (simdInfo.lane == SIMDLane::f32x4)
m_assembler.vcmpps_rr(PackedCompareCondition::LessThanOrEqual, left, right, dest);
else
m_assembler.vcmppd_rr(PackedCompareCondition::LessThanOrEqual, left, right, dest);
break;
default:
RELEASE_ASSERT_NOT_REACHED();
@@ -2279,33 +2303,166 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {

void compareIntegerVector(RelationalCondition cond, SIMDInfo simdInfo, FPRegisterID left, FPRegisterID right, FPRegisterID dest)
{
RELEASE_ASSERT(supportsAVXForSIMD());
RELEASE_ASSERT(scalarTypeIsIntegral(simdInfo.lane));
UNUSED_PARAM(left); UNUSED_PARAM(right); UNUSED_PARAM(dest);

switch (cond) {
case Equal:
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpcmpeqb_rr(left, right, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpcmpeqw_rr(left, right, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpcmpeqd_rr(left, right, dest);
break;
case SIMDLane::i64x2:
m_assembler.vpcmpeqq_rr(left, right, dest);
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
case NotEqual:
// NotEqual comparisons are implemented by negating Equal on Intel, which should be
// handled before we ever reach this point.
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Shouldn't emit integer vector NotEqual comparisons directly.");
break;
case Above:
// Above comparisons are implemented by negating BelowOrEqual on Intel, which should be
// handled before we ever reach this point.
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Shouldn't emit integer vector Above comparisons directly.");
break;
case AboveOrEqual:
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpmaxub_rr(left, right, dest);
m_assembler.vpcmpeqb_rr(left, dest, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpmaxuw_rr(left, right, dest);
m_assembler.vpcmpeqw_rr(left, dest, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpmaxud_rr(left, right, dest);
m_assembler.vpcmpeqd_rr(left, dest, dest);
break;
case SIMDLane::i64x2:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("i64x2 unsigned comparisons are not supported.");
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
case Below:
// a < b => b > a
// Below comparisons are implemented by negating AboveOrEqual on Intel, which should be
// handled before we ever reach this point.
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Shouldn't emit integer vector Below comparisons directly.");
break;
case BelowOrEqual:
// a <= b  <=>  min(a, b) == a, so take the unsigned minimum and compare it against left.
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpminub_rr(left, right, dest);
m_assembler.vpcmpeqb_rr(left, dest, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpminuw_rr(left, right, dest);
m_assembler.vpcmpeqw_rr(left, dest, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpminud_rr(left, right, dest);
m_assembler.vpcmpeqd_rr(left, dest, dest);
break;
case SIMDLane::i64x2:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("i64x2 unsigned comparisons are not supported.");
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
case GreaterThan:
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpcmpgtb_rr(left, right, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpcmpgtw_rr(left, right, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpcmpgtd_rr(left, right, dest);
break;
case SIMDLane::i64x2:
m_assembler.vpcmpgtq_rr(left, right, dest);
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
case GreaterThanOrEqual:
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpmaxsb_rr(left, right, dest);
m_assembler.vpcmpeqb_rr(left, dest, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpmaxsw_rr(left, right, dest);
m_assembler.vpcmpeqw_rr(left, dest, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpmaxsd_rr(left, right, dest);
m_assembler.vpcmpeqd_rr(left, dest, dest);
break;
case SIMDLane::i64x2:
// Intel doesn't support 64-bit packed maximum/minimum without AVX512, so this condition should have been transformed
// into a negated LessThan prior to reaching the macro assembler.
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Shouldn't emit integer vector GreaterThanOrEqual comparisons directly.");
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
case LessThan:
// a < b => b > a
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpcmpgtb_rr(right, left, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpcmpgtw_rr(right, left, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpcmpgtd_rr(right, left, dest);
break;
case SIMDLane::i64x2:
m_assembler.vpcmpgtq_rr(right, left, dest);
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
case LessThanOrEqual:
// a <= b  <=>  min(a, b) == a, so take the signed minimum and compare it against left.
switch (simdInfo.lane) {
case SIMDLane::i8x16:
m_assembler.vpminsb_rr(left, right, dest);
m_assembler.vpcmpeqb_rr(left, dest, dest);
break;
case SIMDLane::i16x8:
m_assembler.vpminsw_rr(left, right, dest);
m_assembler.vpcmpeqw_rr(left, dest, dest);
break;
case SIMDLane::i32x4:
m_assembler.vpminsd_rr(left, right, dest);
m_assembler.vpcmpeqd_rr(left, dest, dest);
break;
case SIMDLane::i64x2:
// Intel doesn't support 64-bit packed maximum/minimum without AVX512, so this condition should have been transformed
// into a negated GreaterThan prior to reaching the macro assembler.
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Shouldn't emit integer vector LessThanOrEqual comparisons directly.");
break;
default:
RELEASE_ASSERT_NOT_REACHED_WITH_MESSAGE("Unsupported SIMD lane for comparison");
}
break;
default:
RELEASE_ASSERT_NOT_REACHED();
@@ -297,7 +297,6 @@ class X86Assembler {
OP2_PSHUFHW_VdqWdqIb = 0x70,
OP2_PSLLQ_UdqIb = 0x73,
OP2_PSRLQ_UdqIb = 0x73,
OP2_PCMPEQW_VdqWdq = 0x75,
OP2_MOVD_EdVd = 0x7E,
OP2_JCC_rel32 = 0x80,
OP_SETCC = 0x90,
@@ -327,19 +326,15 @@ class X86Assembler {
OP2_BSWAP = 0xC8,
OP2_PSUBUSB_VdqWdq = 0xD8,
OP2_PSUBUSW_VdqWdq = 0xD9,
OP2_PMINUB_VdqWdq = 0xDA,
OP2_PADDUSB_VdqWdq = 0xDC,
OP2_PADDUSW_VdqWdq = 0xDD,
OP2_PMAXUB_VdqWdq = 0xDE,
OP2_PAVGB_VdqWdq = 0xE0,
OP2_PAVGW_VdqWdq = 0xE3,
OP2_PSUBSB_VdqWdq = 0xE8,
OP2_PSUBSW_VdqWdq = 0xE9,
OP2_PMINSW_VdqWdq = 0xEA,
OP2_POR_VdqWdq = 0XEB,
OP2_PADDSB_VdqWdq = 0xEC,
OP2_PADDSW_VdqWdq = 0xED,
OP2_PMAXSW_VdqWdq = 0xEE,
OP2_PXOR_VdqWdq = 0xEF,
OP2_PADDB_VdqWdq = 0xFC,
OP2_PADDW_VdqWdq = 0xFD,
@@ -360,7 +355,19 @@ class X86Assembler {
OP2_DIVPD_VpdWpd = 0x5E,
OP2_SQRTPS_VpsWps = 0x51,
OP2_SQRTPD_VpdWpd = 0x51,
OP2_PMADDWD_VdqWdq = 0xF5
OP2_PMADDWD_VdqWdq = 0xF5,
OP2_PCMPEQB_VdqWdq = 0x74,
OP2_PCMPEQW_VdqWdq = 0x75,
OP2_PCMPEQD_VdqWdq = 0x76,
OP2_PCMPGTB_VdqWdq = 0x64,
OP2_PCMPGTW_VdqWdq = 0x65,
OP2_PCMPGTD_VdqWdq = 0x66,
OP2_CMPPS_VpsWpsIb = 0xC2,
OP2_CMPPD_VpdWpdIb = 0xC2,
OP2_PMAXSW_VdqWdq = 0xEE,
OP2_PMAXUB_VdqWdq = 0xDE,
OP2_PMINSW_VdqWdq = 0xEA,
OP2_PMINUB_VdqWdq = 0xDA
} TwoByteOpcodeID;

typedef enum {
@@ -377,21 +384,23 @@ class X86Assembler {
OP3_INSERTPS_VpsUpsIb = 0x21,
OP3_PINSRB = 0x20,
OP3_PINSRD = 0x22,
OP3_PMINSB_VdqWdq = 0x38,
OP3_PMINSD_VdqWdq = 0x39,
OP3_PMINUW_VdqWdq = 0x3A,
OP3_PMINUD_VdqWdq = 0x3B,
OP3_PMAXSB_VdqWdq = 0x3C,
OP3_PMAXSD_VdqWdq = 0x3D,
OP3_PMAXUW_VdqWdq = 0x3E,
OP3_PMAXUD_VdqWdq = 0x3F,
OP3_BLENDVPD_VpdWpdXMM0 = 0x4B,
OP3_LFENCE = 0xE8,
OP3_MFENCE = 0xF0,
OP3_SFENCE = 0xF8,
OP3_ROUNDPS_VpsWpsIb = 0x08,
OP3_ROUNDPD_VpdWpdIb = 0x09,
OP3_PMULLD_VdqWdq = 0x40
OP3_PMULLD_VdqWdq = 0x40,
OP3_PCMPEQQ_VdqWdq = 0x29,
OP3_PCMPGTQ_VdqWdq = 0x37,
OP3_PMAXSB_VdqWdq = 0x3C,
OP3_PMAXSD_VdqWdq = 0x3D,
OP3_PMAXUW_VdqWdq = 0x3E,
OP3_PMAXUD_VdqWdq = 0x3F,
OP3_PMINSB_VdqWdq = 0x38,
OP3_PMINSD_VdqWdq = 0x39,
OP3_PMINUW_VdqWdq = 0x3A,
OP3_PMINUD_VdqWdq = 0x3B
} ThreeByteOpcodeID;

struct VexPrefix {
@@ -3223,6 +3232,70 @@ class X86Assembler {
m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_OPERAND_SIZE, OP2_PMADDWD_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPEQB: lane-wise equality compare of packed bytes in `a` and `b`, result written to `dest`.
// Operands are forwarded destination-first, mirroring how the PMADDWD emitter just above drives
// this same formatter entry point; the previous (a, b, dest) order would have encoded `a` as the
// destination under that convention.
// NOTE(review): confirm against the formatter's parameter order.
void vpcmpeqb_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_OPERAND_SIZE, OP2_PCMPEQB_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPEQD: lane-wise equality compare of packed 32-bit integers in `a` and `b`, result written to `dest`.
// Operands are forwarded destination-first, mirroring how the PMADDWD emitter earlier in this file
// drives this same formatter entry point; the previous (a, b, dest) order would have encoded `a`
// as the destination under that convention.
// NOTE(review): confirm against the formatter's parameter order.
void vpcmpeqd_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_OPERAND_SIZE, OP2_PCMPEQD_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPEQQ: lane-wise equality compare of packed 64-bit integers in `a` and `b`, result written to `dest`.
// Uses the three-byte (0F 38) opcode map. Operands are forwarded destination-first, the order the
// PMADDWD emitter earlier in this file uses for the two-byte VEX formatter.
// NOTE(review): presumably the three-byte formatter takes (dest, a, b) as well — confirm against
// its signature.
void vpcmpeqq_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigThreeByteOp(PRE_OPERAND_SIZE, VexImpliedBytes::ThreeBytesOp38, OP3_PCMPEQQ_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPGTB: lane-wise signed `a > b` compare of packed bytes, result written to `dest`.
// The compare is not commutative, so the non-commutative formatter is used and `a` must stay the
// first source. Operands are forwarded destination-first, the order the PMADDWD emitter earlier in
// this file uses; the previous (a, b, dest) order would have encoded `a` as the destination under
// that convention.
// NOTE(review): confirm against the formatter's parameter order.
void vpcmpgtb_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigTwoByteOp(PRE_OPERAND_SIZE, OP2_PCMPGTB_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPGTW: lane-wise signed `a > b` compare of packed 16-bit integers, result written to `dest`.
// The compare is not commutative, so the non-commutative formatter is used and `a` must stay the
// first source. Operands are forwarded destination-first, the order the PMADDWD emitter earlier in
// this file uses; the previous (a, b, dest) order would have encoded `a` as the destination under
// that convention.
// NOTE(review): confirm against the formatter's parameter order.
void vpcmpgtw_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigTwoByteOp(PRE_OPERAND_SIZE, OP2_PCMPGTW_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPGTD: lane-wise signed `a > b` compare of packed 32-bit integers, result written to `dest`.
// The compare is not commutative, so the non-commutative formatter is used and `a` must stay the
// first source. Operands are forwarded destination-first, the order the PMADDWD emitter earlier in
// this file uses; the previous (a, b, dest) order would have encoded `a` as the destination under
// that convention.
// NOTE(review): confirm against the formatter's parameter order.
void vpcmpgtd_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigTwoByteOp(PRE_OPERAND_SIZE, OP2_PCMPGTD_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// VPCMPGTQ: lane-wise signed `a > b` compare of packed 64-bit integers, result written to `dest`.
// Uses the three-byte (0F 38) opcode map. Operands are forwarded destination-first, the order the
// PMADDWD emitter earlier in this file uses for the two-byte VEX formatter.
// NOTE(review): presumably the three-byte formatter takes (dest, a, b) as well — confirm against
// its signature.
void vpcmpgtq_rr(XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    m_formatter.vexNdsLigWigThreeByteOp(PRE_OPERAND_SIZE, VexImpliedBytes::ThreeBytesOp38, OP3_PCMPGTQ_VdqWdq, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
}

// Comparison predicates for VCMPPS / VCMPPD. The enumerator value is emitted verbatim as the
// instruction's trailing imm8.
enum class PackedCompareCondition : uint8_t {
    Equal = 0,
    LessThan = 1,
    LessThanOrEqual = 2,
    Unordered = 3,
    NotEqual = 4,
    Ordered = 7,
    // Predicates 5 and 6 are "not less than" / "not less than or equal" in the Intel manual; they
    // evaluate to true when either operand is NaN, so they cannot stand in for the ordered
    // greater-than comparisons this enum is used for. Use the ordered predicates instead.
    // These two encodings exist only for the VEX-encoded (AVX) forms of the instruction.
    GreaterThanOrEqual = 13, // GE_OS in the Intel manual
    GreaterThan = 14 // GT_OS in the Intel manual
};

// VCMPPS: lane-wise compare of packed single-precision floats, `a <condition> b`, result written
// to `dest`. No SIMD prefix is used for the packed-single form, hence the zero prefix argument.
// Operands are forwarded destination-first, the order the PMADDWD emitter earlier in this file
// uses; the previous (a, b, dest) order would have encoded `a` as the destination under that
// convention.
// NOTE(review): confirm against the formatter's parameter order.
void vcmpps_rr(PackedCompareCondition condition, XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    // Equal and NotEqual do not depend on operand order, so only they go through the commutative formatter.
    if (condition == PackedCompareCondition::Equal || condition == PackedCompareCondition::NotEqual)
        m_formatter.vexNdsLigWigCommutativeTwoByteOp((OneByteOpcodeID)0, OP2_CMPPS_VpsWpsIb, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
    else
        m_formatter.vexNdsLigWigTwoByteOp((OneByteOpcodeID)0, OP2_CMPPS_VpsWpsIb, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
    // The comparison predicate follows the instruction as an 8-bit immediate.
    m_formatter.immediate8(static_cast<uint8_t>(condition));
}

// VCMPPD: lane-wise compare of packed double-precision floats, `a <condition> b`, result written
// to `dest`. Same opcode byte as VCMPPS; the operand-size prefix selects the packed-double form.
// Operands are forwarded destination-first, the order the PMADDWD emitter earlier in this file
// uses; the previous (a, b, dest) order would have encoded `a` as the destination under that
// convention.
// NOTE(review): confirm against the formatter's parameter order.
void vcmppd_rr(PackedCompareCondition condition, XMMRegisterID a, XMMRegisterID b, XMMRegisterID dest)
{
    // Equal and NotEqual do not depend on operand order, so only they go through the commutative formatter.
    if (condition == PackedCompareCondition::Equal || condition == PackedCompareCondition::NotEqual)
        m_formatter.vexNdsLigWigCommutativeTwoByteOp(PRE_OPERAND_SIZE, OP2_CMPPD_VpdWpdIb, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
    else
        m_formatter.vexNdsLigWigTwoByteOp(PRE_OPERAND_SIZE, OP2_CMPPD_VpdWpdIb, (RegisterID)dest, (RegisterID)a, (RegisterID)b);
    // The comparison predicate follows the instruction as an 8-bit immediate.
    m_formatter.immediate8(static_cast<uint8_t>(condition));
}

void movl_rr(RegisterID src, RegisterID dst)
{
m_formatter.oneByteOp(OP_MOV_EvGv, src, dst);

0 comments on commit c6145db

Please sign in to comment.