Skip to content
Permalink
Browse files
[SIMD] Intel support for extended integer arithmetic and fix bitmask operation

https://bugs.webkit.org/show_bug.cgi?id=249042
rdar://103192622

Reviewed by Yusuke Suzuki.

Add support for extended integer arithmetic operations and fix `i16x8.bitmask`.
https://github.com/WebAssembly/simd/blob/main/proposals/simd/SIMD.md#extended-integer-arithmetic

* Source/JavaScriptCore/assembler/MacroAssemblerX86_64.h:
(JSC::MacroAssemblerX86_64::vectorBitmask):
(JSC::MacroAssemblerX86_64::vectorExtaddPairwise):
* Source/JavaScriptCore/assembler/X86Assembler.h:
(JSC::X86Assembler::vshufps_rrr):
(JSC::X86Assembler::vblendvpd_rrrr):
(JSC::X86Assembler::vcmppd_rrr):
(JSC::X86Assembler::vmovdqa_rr):
(JSC::X86Assembler::vpmaddubsw_rrr):
(JSC::X86Assembler::vpsrld_i8rr):
(JSC::X86Assembler::vpblendw_i8rrr):
(JSC::X86Assembler::X86InstructionFormatter::SingleInstructionBufferWriter::memoryModRM):
* Source/JavaScriptCore/b3/air/AirOpcode.opcodes:
* Source/JavaScriptCore/wasm/WasmAirIRGenerator.cpp:
(JSC::Wasm::AirIRGenerator::addSIMDV_V):

Canonical link: https://commits.webkit.org/257657@main
  • Loading branch information
hyjorc1 authored and Yijia Huang committed Dec 10, 2022
1 parent 2b352bf commit ddd1dc16c98a173f379e7e36af45486fc9ae13c7
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 26 deletions.
@@ -3334,8 +3334,8 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
break;
case SIMDLane::i16x8:
m_assembler.vpxor_rrr(tmp, tmp, tmp);
m_assembler.vpacksswb_rrr(vec, tmp, tmp);
m_assembler.vpmovmskb_rr(vec, dest);
m_assembler.vpacksswb_rrr(tmp, vec, tmp);
m_assembler.vpmovmskb_rr(tmp, dest);
break;
case SIMDLane::i32x4:
m_assembler.vmovmskps_rr(vec, dest);
@@ -3348,7 +3348,40 @@ class MacroAssemblerX86_64 : public MacroAssemblerX86Common {
}
}

// Placeholder three-argument overload: every parameter is marked unused and no instruction is emitted.
void vectorExtaddPairwise(SIMDInfo simdInfo, FPRegisterID vec, FPRegisterID dest) { UNUSED_PARAM(simdInfo); UNUSED_PARAM(vec); UNUSED_PARAM(dest); }
// Emits the AVX lowering of the WebAssembly extended pairwise addition: adjacent lanes of `vec`
// are widened and summed into `dest`.
//
// The trick (see https://github.com/WebAssembly/simd/pull/380) is to multiply-accumulate against
// a vector whose lanes are all 1, so the "multiply" is a no-op and only the pairwise add remains.
//
//   simdInfo   - lane shape and signedness of the input. Supported here: i8x16 (signed or
//                unsigned) and signed i16x8. Unsigned i16x8 is not handled by this function;
//                vectorExtaddPairwiseUnsignedInt16 exists for that case.
//   vec        - input vector.
//   dest       - output vector; written only by the final instruction.
//   scratchGPR - clobbered; holds the constant 1 that gets splatted.
//   scratchFPR - clobbered; holds the all-ones-lanes vector.
void vectorExtaddPairwise(SIMDInfo simdInfo, FPRegisterID vec, FPRegisterID dest, RegisterID scratchGPR, FPRegisterID scratchFPR)
{
    RELEASE_ASSERT(supportsAVXForSIMD());

    move(TrustedImm64(1), scratchGPR);
    switch (simdInfo.lane) {
    case SIMDLane::i8x16:
        vectorSplat8(scratchGPR, scratchFPR);
        // VPMADDUBSW treats its first source (xmm2) as unsigned bytes and its second source (xmm3)
        // as signed bytes. vpmaddubsw_rrr takes (xmm3, xmm2, xmm1), so `vec` is placed in whichever
        // slot matches the requested signedness and the splatted ones fill the other slot.
        // No register copy is needed beforehand: the splat already left the ones in scratchFPR.
        if (simdInfo.signMode == SIMDSignMode::Signed)
            m_assembler.vpmaddubsw_rrr(vec, scratchFPR, dest);
        else
            m_assembler.vpmaddubsw_rrr(scratchFPR, vec, dest);
        return;
    case SIMDLane::i16x8:
        vectorSplat16(scratchGPR, scratchFPR);
        // VPMADDWD multiplies signed 16-bit lanes, so only the signed variant can use it.
        if (simdInfo.signMode == SIMDSignMode::Signed)
            m_assembler.vpmaddwd_rrr(vec, scratchFPR, dest);
        else
            RELEASE_ASSERT_NOT_REACHED();
        return;
    default:
        RELEASE_ASSERT_NOT_REACHED();
    }
}

// Unsigned i16x8 extended pairwise addition for AVX-capable x86_64.
// Every 32-bit lane of `src` holds a pair of 16-bit values; the matching lane of `dest`
// receives the sum of the two, each zero-extended to 32 bits.
//
//   scratch1 <- upper half of each pair (logical right shift of every 32-bit lane by 16)
//   scratch2 <- lower half of each pair (even words from src, odd words from scratch1,
//               whose odd words are zero after the shift)
//   dest     <- scratch1 + scratch2, as 32-bit lanes
//
// Both scratch registers are clobbered.
void vectorExtaddPairwiseUnsignedInt16(FPRegisterID src, FPRegisterID dest, FPRegisterID scratch1, FPRegisterID scratch2)
{
    RELEASE_ASSERT(supportsAVXForSIMD());

    constexpr uint8_t bitsPerHalfLane = 16;
    // Bits 1, 3, 5 and 7 set: the odd 16-bit words are taken from the blend's xmm3 operand.
    constexpr uint8_t oddWordSelector = 0xAA;

    m_assembler.vpsrld_i8rr(bitsPerHalfLane, src, scratch1);
    m_assembler.vpblendw_i8rrr(oddWordSelector, scratch1, src, scratch2);
    m_assembler.vpaddd_rrr(scratch1, scratch2, dest);
}

void vectorAvgRound(SIMDInfo simdInfo, FPRegisterID a, FPRegisterID b, FPRegisterID dest)
{
@@ -304,6 +304,7 @@ class X86Assembler {
OP2_PACKSSDW_VdqWdq = 0x6B,
OP2_PUNPCKLQDQ_VdqWdq = 0x6C,
OP2_MOVD_VdEd = 0x6E,
OP2_MOVDQA_VdqWdq = 0x6F,
OP2_PSHUFD_VdqWdqIb = 0x70,
OP2_PSHUFLW_VdqWdqIb = 0x70,
OP2_PSHUFHW_VdqWdqIb = 0x70,
@@ -359,6 +360,7 @@ class X86Assembler {
OP2_PSUBD_VdqWdq = 0xFA,
OP2_PSUBQ_VdqWdq = 0xFB,
OP2_PMULLW_VdqWdq = 0xD5,
OP2_PMOVMSKB_GdqpUdq = 0xD7,
OP2_ADDPS_VpsWps = 0x58,
OP2_ADDPD_VpdWpd = 0x58,
OP2_SUBPS_VpsWps = 0x5C,
@@ -398,6 +400,7 @@ class X86Assembler {

typedef enum {
OP3_PSHUFB_VdqWdq = 0x00,
OP3_PMADDUBSW_VpdWpd = 0x04,
OP3_ROUNDPD_MbVdqIb = 0x09,
OP3_ROUNDSS_VssWssIb = 0x0A,
OP3_ROUNDSD_VsdWsdIb = 0x0B,
@@ -2693,7 +2696,8 @@ class X86Assembler {
// VEX.128.0F.WIG C6 /r ib VSHUFPS xmm1, xmm2, xmm3/m128, imm8
bool isVEX256 = false;
bool isW1 = false;
m_formatter.vexThreeByteOp(isVEX256, PRE_SSE_66, VexImpliedBytes::TwoBytesOp, isW1, OP2_SHUFPS_VpdWpdIb, controlBits, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
m_formatter.vexThreeByteOp(isVEX256, PRE_SSE_66, VexImpliedBytes::TwoBytesOp, isW1, OP2_SHUFPS_VpdWpdIb, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
m_formatter.immediate8(controlBits);
}

void shufpd_rr(uint8_t controlBits, XMMRegisterID vn, XMMRegisterID vd)
@@ -3157,7 +3161,8 @@ class X86Assembler {
// VEX.128.66.0F3A.W0 4B /r /is4 VBLENDVPD xmm1, xmm2, xmm3/m128, xmm4
bool isVEX256 = false;
bool isW1 = false;
m_formatter.vexThreeByteOp(isVEX256, PRE_SSE_66, VexImpliedBytes::ThreeBytesOp3A, isW1, OP3_BLENDVPD_VpdWpdXMM0, xmm4, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
m_formatter.vexThreeByteOp(isVEX256, PRE_SSE_66, VexImpliedBytes::ThreeBytesOp3A, isW1, OP3_BLENDVPD_VpdWpdXMM0, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
m_formatter.immediate8(static_cast<uint8_t>(xmm4) << 4); // imm8[7:4]
}

void pblendw_rr(uint8_t imm8, XMMRegisterID vn, XMMRegisterID vd)
@@ -3444,7 +3449,8 @@ class X86Assembler {
// VEX.128.66.0F.WIG C2 /r ib VCMPPD xmm1, xmm2, xmm3/m128, imm8
bool isVEX256 = false;
bool isW1 = false;
m_formatter.vexThreeByteOp(isVEX256, PRE_SSE_66, VexImpliedBytes::TwoBytesOp, isW1, OP2_CMPPD_VpdWpdIb, imm8, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
m_formatter.vexThreeByteOp(isVEX256, PRE_SSE_66, VexImpliedBytes::TwoBytesOp, isW1, OP2_CMPPD_VpdWpdIb, (RegisterID)xmm1, (RegisterID)xmm3, (RegisterID)xmm2);
m_formatter.immediate8(imm8);
}

void vcmpeqpd_rrr(XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
@@ -3764,6 +3770,44 @@ class X86Assembler {
m_formatter.vexNdsLigWigTwoByteOp(PRE_SSE_66, OP2_PSRAQ_VdqWdq, (RegisterID)dest, (RegisterID)input, (RegisterID)shift);
}

// Register-to-register VMOVDQA: copies the 128-bit register `vn` into `vd`.
void vmovdqa_rr(XMMRegisterID vn, XMMRegisterID vd)
{
    // Encoding: VEX.128.66.0F.WIG 6F /r    VMOVDQA xmm1, xmm2/m128
    // Reference: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
    constexpr bool use256BitForm = false;
    constexpr bool vexW = false;

    // Destination goes in the reg slot, source in the rm slot. The instruction has no third
    // register operand, so 0 is passed for vvvv.
    const auto destination = static_cast<RegisterID>(vd);
    const auto source = static_cast<RegisterID>(vn);
    m_formatter.vexThreeByteOp(use256BitForm, PRE_SSE_66, VexImpliedBytes::TwoBytesOp, vexW, OP2_MOVDQA_VdqWdq, destination, source, static_cast<RegisterID>(0));
}

// Three-register VPMADDUBSW. Parameter names follow the Intel operand names:
// xmm1 is the destination, xmm2 the first source and xmm3 the second source.
void vpmaddubsw_rrr(XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
    // Encoding: VEX.128.66.0F38.WIG 04 /r    VPMADDUBSW xmm1, xmm2, xmm3/m128
    // Reference: https://www.felixcloutier.com/x86/pmaddubsw
    constexpr bool use256BitForm = false;
    constexpr bool vexW = false;

    // Slot assignment: reg = xmm1, rm = xmm3, vvvv = xmm2.
    const auto regOperand = static_cast<RegisterID>(xmm1);
    const auto rmOperand = static_cast<RegisterID>(xmm3);
    const auto vvvvOperand = static_cast<RegisterID>(xmm2);
    m_formatter.vexThreeByteOp(use256BitForm, PRE_SSE_66, VexImpliedBytes::ThreeBytesOp38, vexW, OP3_PMADDUBSW_VpdWpd, regOperand, rmOperand, vvvvOperand);
}

// VPSRLD with an immediate count: shifts each 32-bit lane of `vn` right by `imm8` bits
// (logical shift) and writes the result to `vd`.
void vpsrld_i8rr(uint8_t imm8, XMMRegisterID vn, XMMRegisterID vd)
{
    // Encoding: VEX.128.66.0F.WIG 72 /2 ib    VPSRLD xmm1, xmm2, imm8
    // Reference: https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
    constexpr bool use256BitForm = false;
    constexpr bool vexW = false;

    // In this form the reg slot holds the opcode extension digit rather than a register,
    // the source register sits in rm and the destination in vvvv.
    // NOTE(review): the digit is supplied through GROUP14_OP_PSRLQ, a constant named for PSRLQ;
    // presumably its value is 2, matching the "/2" above - confirm against the enum definition.
    const auto opcodeExtension = static_cast<RegisterID>(GROUP14_OP_PSRLQ);
    const auto source = static_cast<RegisterID>(vn);
    const auto destination = static_cast<RegisterID>(vd);
    m_formatter.vexThreeByteOp(use256BitForm, PRE_SSE_66, VexImpliedBytes::TwoBytesOp, vexW, OP2_PSRLD_UdqIb, opcodeExtension, source, destination);
    m_formatter.immediate8(imm8);
}

// Three-register VPBLENDW with an immediate word-select mask. Parameter names follow the
// Intel operand names: xmm1 is the destination, xmm2 the first source, xmm3 the second source.
// For each of the eight 16-bit words, a set bit in `imm8` takes the word from xmm3 and a clear
// bit takes it from xmm2.
void vpblendw_i8rrr(uint8_t imm8, XMMRegisterID xmm3, XMMRegisterID xmm2, XMMRegisterID xmm1)
{
    // Encoding: VEX.128.66.0F3A.WIG 0E /r ib    VPBLENDW xmm1, xmm2, xmm3/m128, imm8
    // Reference: https://www.felixcloutier.com/x86/pblendw
    constexpr bool use256BitForm = false;
    constexpr bool vexW = false;

    // Slot assignment: reg = xmm1, rm = xmm3, vvvv = xmm2; the mask follows as a trailing byte.
    const auto regOperand = static_cast<RegisterID>(xmm1);
    const auto rmOperand = static_cast<RegisterID>(xmm3);
    const auto vvvvOperand = static_cast<RegisterID>(xmm2);
    m_formatter.vexThreeByteOp(use256BitForm, PRE_SSE_66, VexImpliedBytes::ThreeBytesOp3A, vexW, OP3_PBLENDW, regOperand, rmOperand, vvvvOperand);
    m_formatter.immediate8(imm8);
}

void movl_rr(RegisterID src, RegisterID dst)
{
m_formatter.oneByteOp(OP_MOV_EvGv, src, dst);
@@ -6019,24 +6063,6 @@ class X86Assembler {
writer.putByteUnchecked((uint8_t)laneIndex);
}

// Writes one register-register VEX instruction with a ThreeByteOpcodeID opcode plus a trailing byte.
// Emission order: three-byte VEX prefix, opcode byte, ModRM(reg, rm), then the value imm8 << 4.
// NOTE(review): the trailing byte is derived from imm8 << 4, not imm8 itself. That fits a register
// selector carried in bits 7:4 (an "/is4" operand) but would not reproduce an ordinary ib immediate;
// confirm what each caller passes.
void vexThreeByteOp(bool isVEX256, OneByteOpcodeID simdPrefix, VexImpliedBytes impliedBytes, bool isW1, ThreeByteOpcodeID opcode, uint8_t imm8, RegisterID reg, RegisterID rm, RegisterID vvvv)
{
SingleInstructionBufferWriter writer(m_buffer);
// The fourth register argument of threeBytesVex is fixed to 0 here; only reg, rm and vvvv vary.
writer.threeBytesVex(isVEX256, simdPrefix, impliedBytes, isW1, reg, (RegisterID)0, rm, vvvv);
writer.putByteUnchecked(opcode);
writer.registerModRM(reg, rm);
// The cast binds to imm8 before the shift, so the shifted value is computed at int width.
writer.putByteUnchecked((uint8_t)imm8 << 4);
}

// Writes one register-register VEX instruction with a TwoByteOpcodeID opcode plus a trailing byte.
// Emission order: three-byte VEX prefix, opcode byte, ModRM(reg, rm), then the value imm8 << 4.
// NOTE(review): the trailing byte is derived from imm8 << 4, not imm8 itself, so an immediate such
// as a shuffle control or compare predicate would not be emitted verbatim; confirm what each
// caller passes.
void vexThreeByteOp(bool isVEX256, OneByteOpcodeID simdPrefix, VexImpliedBytes impliedBytes, bool isW1, TwoByteOpcodeID opcode, uint8_t imm8, RegisterID reg, RegisterID rm, RegisterID vvvv)
{
SingleInstructionBufferWriter writer(m_buffer);
// The fourth register argument of threeBytesVex is fixed to 0 here; only reg, rm and vvvv vary.
writer.threeBytesVex(isVEX256, simdPrefix, impliedBytes, isW1, reg, (RegisterID)0, rm, vvvv);
writer.putByteUnchecked(opcode);
writer.registerModRM(reg, rm);
// The cast binds to imm8 before the shift, so the shifted value is computed at int width.
writer.putByteUnchecked((uint8_t)imm8 << 4);
}

void vexThreeByteOp(bool isVEX256, OneByteOpcodeID simdPrefix, VexImpliedBytes impliedBytes, bool isW1, ThreeByteOpcodeID opcode, RegisterID reg, RegisterID rm, RegisterID vvvv)
{
SingleInstructionBufferWriter writer(m_buffer);
@@ -1872,9 +1872,15 @@ arm64: VectorBitmask U:G:8, U:F:128, ZD:G:32
x86_64: VectorBitmask U:G:8, U:F:128, ZD:G:32, S:F:128
SIMDInfo, Tmp, Tmp, Tmp

VectorExtaddPairwise U:G:8, U:F:128, D:F:128
arm64: VectorExtaddPairwise U:G:8, U:F:128, D:F:128
SIMDInfo, Tmp, Tmp

x86_64: VectorExtaddPairwise U:G:8, U:F:128, D:F:128, S:G:64, S:F:128
SIMDInfo, Tmp, Tmp, Tmp, Tmp

x86_64: VectorExtaddPairwiseUnsignedInt16 U:F:128, D:F:128, S:F:128, S:F:128
Tmp, Tmp, Tmp, Tmp

arm64: VectorAddPairwise U:G:8, U:F:128, U:F:128, D:F:128
SIMDInfo, Tmp, Tmp, Tmp

@@ -578,6 +578,14 @@ class AirIRGenerator {

result = tmpForType(Types::V128);

if (isX86() && airOp == B3::Air::VectorExtaddPairwise) {
if (info.lane == SIMDLane::i16x8 && info.signMode == SIMDSignMode::Unsigned)
append(VectorExtaddPairwiseUnsignedInt16, v, result, tmpForType(Types::V128), tmpForType(Types::V128));
else
append(airOp, Arg::simdInfo(info), v, result, tmpForType(Types::I64), tmpForType(Types::V128));
return { };
}

if (isX86() && airOp == B3::Air::VectorConvert && info.signMode == SIMDSignMode::Unsigned) {
append(VectorConvertUnsigned, v, result, tmpForType(Types::V128));
return { };
@@ -615,7 +623,6 @@ class AirIRGenerator {
}
return { };
}

}

if (isValidForm(airOp, Arg::Tmp, Arg::Tmp)) {

0 comments on commit ddd1dc1

Please sign in to comment.