-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Ventus][fix]Fix libclc shuffle function #133
Conversation
Passed OPENCL-CTS shuffle_built_in testcase
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
能贴出inline与不inline的汇编代码区别吗? |
以char4为例,shuffle函数代码: inline short __clc_get_el_short4_ushort(short4 x, ushort idx) {
/* Select one component of x by a run-time index:
 * idx 0..3 returns x.s0..x.s3; any other idx returns 0.
 * NOTE(review): the introductory text says char4, but the code shown here
 * operates on short4 (and is indexed by a ushort). */
switch (idx){
case 0: return x.s0;
case 1: return x.s1;
case 2: return x.s2;
case 3: return x.s3;
/* Out-of-range selector: falls back to 0 rather than wrapping the index. */
default: return 0; } }
/* shuffle(short4 x, ushort4 mask): builds a short4 whose lane i is the
 * component of x selected by mask.s<i>, looked up one lane at a time through
 * __clc_get_el_short4_ushort.
 * NOTE(review): mask elements are passed through unmodified, so a value
 * above 3 reaches the helper's default case and yields 0. The OpenCL C
 * specification says only the low bits of each mask element are considered;
 * confirm whether the mask is reduced elsewhere in the real libclc source. */
__attribute__((overloadable)) short4 shuffle(short4 x, ushort4 mask){
short4 ret_val;
ret_val.s0 = __clc_get_el_short4_ushort(x, mask.s0);
ret_val.s1 = __clc_get_el_short4_ushort(x, mask.s1);
ret_val.s2 = __clc_get_el_short4_ushort(x, mask.s2);
ret_val.s3 = __clc_get_el_short4_ushort(x, mask.s3);
return ret_val; }
inline版本汇编:
.text
.attribute 4, 16
.attribute 5, "rv32i2p0_m2p0_a2p0_zfinx1p0_zdinx1p0_zve32f1p0_zve32x1p0_zvl32b1p0_zhinx1p0"
.file "shufflechar44.cl"
# shuffle(short4, ushort4) with the element-select helper inlined
# (symbol _Z7shuffleDv4_sDv4_t). The body is four copies of the same
# compare-and-branch ladder, one per result lane; each copy ends at a
# "%__clc_get_el_short4_ushort.exit*" label (.LBB0_9, .LBB0_18, .LBB0_27,
# .LBB0_35).
# Register use as read from the code below:
#   v0-v3  candidate elements on entry (v0 is copied to v8 first)
#   v4-v7  sources of the per-lane selectors (each ANDed with 0xFFFF into v4)
#   v0, v5, v6, v8  per-lane results, moved to v0-v3 just before ret
# NOTE(review): setrpc / join / vblt / vbeq / vbne are not base RISC-V;
# presumably Ventus GPGPU divergence / reconvergence instructions - confirm
# against the Ventus ISA documentation.
.globl _Z7shuffleDv4_sDv4_t # -- Begin function _Z7shuffleDv4_sDv4_t
.p2align 2
.type _Z7shuffleDv4_sDv4_t,@function
_Z7shuffleDv4_sDv4_t: # @_Z7shuffleDv4_sDv4_t
# %bb.0: # %entry
# Prologue: sp is advanced by 4 and ra is stored in the slot just below it.
addi sp, sp, 4
sw ra, -4(sp) # 4-byte Folded Spill
# Keep a copy of the incoming v0 in v8; v0 is reused as a scratch/result register.
vadd.vx v8, v0, zero
# t0 = (16 << 12) - 1 = 0xFFFF; keep only the low 16 bits of the lane-0 selector.
lui t0, 16
addi t0, t0, -1
vand.vx v4, v4, t0
# ---- Lane 0: result in v0 ----
# Broadcast the constant 1 into v0 before t1 is reused by auipc.
li t1, 1
vmv.v.x v0, t1
# kill: def $v9 killed $x5
.Lpcrel_hi0:
auipc t1, %pcrel_hi(.LBB0_9)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi0)
# Presumably "branch where 1 < selector", i.e. selector is 2 or 3 - matches the switch in the C source.
vblt v0, v4, .LBB0_4
# %bb.1: # %entry
vmv.v.x v0, zero
.Lpcrel_hi1:
auipc t1, %pcrel_hi(.LBB0_9)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi1)
vbeq v4, v0, .LBB0_7
# %bb.2: # %entry
li t0, 1
vmv.v.x v9, t0
vmv.v.x v0, zero
vbne v4, v9, .LBB0_9
# NOTE(review): this write to v5 sits after the branch, and t1 was last
# written by auipc (not by "li t1, 1"). v5 is read as the lane-1 selector
# source after .LBB0_9, so this looks like part of the miscompile discussed
# in the thread - confirm.
vmv.v.x v5, t1
# %bb.3: # %sw.bb1.i
vadd.vx v0, v1, zero
j .LBB0_9
.LBB0_4: # %entry
li t0, 2
vmv.v.x v0, t0
.Lpcrel_hi2:
auipc t1, %pcrel_hi(.LBB0_9)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi2)
vbeq v4, v0, .LBB0_8
# %bb.5: # %entry
li t0, 3
vmv.v.x v9, t0
vmv.v.x v0, zero
vbne v4, v9, .LBB0_9
# NOTE(review): same pattern as above - v5 overwritten from t1 after the branch.
vmv.v.x v5, t1
# %bb.6: # %sw.bb3.i
vadd.vx v0, v3, zero
j .LBB0_9
.LBB0_7: # %sw.bb.i
# Selector 0: restore the saved copy of the original v0.
vadd.vx v0, v8, zero
j .LBB0_9
.LBB0_8: # %sw.bb2.i
vadd.vx v0, v2, zero
.LBB0_9: # %__clc_get_el_short4_ushort.exit
# Label of block must be emitted
join zero, zero, 0
# ---- Lane 1: selector taken from v5, result in v5 ----
lui t0, 16
addi t0, t0, -1
vand.vx v4, v5, t0
# NOTE(review): unlike lane 0, this "li t1, 1" is not followed by a
# vmv.v.x broadcast; t1 is overwritten by the auipc below, and the vblt then
# compares v5 (not a broadcast 1) against v4. Possibly the "missing
# instruction" referred to in the thread - confirm.
li t1, 1
# kill: def $v9 killed $x5
.Lpcrel_hi3:
auipc t1, %pcrel_hi(.LBB0_18)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi3)
vblt v5, v4, .LBB0_13
# %bb.10: # %__clc_get_el_short4_ushort.exit
vmv.v.x v5, zero
.Lpcrel_hi4:
auipc t1, %pcrel_hi(.LBB0_18)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi4)
vbeq v4, v5, .LBB0_16
# %bb.11: # %__clc_get_el_short4_ushort.exit
li t0, 1
vmv.v.x v9, t0
vmv.v.x v5, zero
vbne v4, v9, .LBB0_18
# NOTE(review): v6 (lane-2 selector source) overwritten from t1 after the branch.
vmv.v.x v6, t1
# %bb.12: # %sw.bb1.i15
vadd.vx v5, v1, zero
j .LBB0_18
.LBB0_13: # %__clc_get_el_short4_ushort.exit
li t0, 2
vmv.v.x v5, t0
.Lpcrel_hi5:
auipc t1, %pcrel_hi(.LBB0_18)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi5)
vbeq v4, v5, .LBB0_17
# %bb.14: # %__clc_get_el_short4_ushort.exit
li t0, 3
vmv.v.x v9, t0
vmv.v.x v5, zero
vbne v4, v9, .LBB0_18
# NOTE(review): v6 overwritten from t1 after the branch, as above.
vmv.v.x v6, t1
# %bb.15: # %sw.bb3.i17
vadd.vx v5, v3, zero
j .LBB0_18
.LBB0_16: # %sw.bb.i14
vadd.vx v5, v8, zero
j .LBB0_18
.LBB0_17: # %sw.bb2.i16
vadd.vx v5, v2, zero
.LBB0_18: # %__clc_get_el_short4_ushort.exit19
# Label of block must be emitted
join zero, zero, 0
# ---- Lane 2: selector taken from v6, result in v6 ----
lui t0, 16
addi t0, t0, -1
vand.vx v4, v6, t0
# NOTE(review): again no broadcast of 1 before t1 is overwritten; the vblt below compares v6 with v4.
li t1, 1
# kill: def $v9 killed $x5
.Lpcrel_hi6:
auipc t1, %pcrel_hi(.LBB0_27)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi6)
vblt v6, v4, .LBB0_22
# %bb.19: # %__clc_get_el_short4_ushort.exit19
vmv.v.x v6, zero
.Lpcrel_hi7:
auipc t1, %pcrel_hi(.LBB0_27)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi7)
vbeq v4, v6, .LBB0_25
# %bb.20: # %__clc_get_el_short4_ushort.exit19
li t0, 1
vmv.v.x v9, t0
vmv.v.x v6, zero
vbne v4, v9, .LBB0_27
# NOTE(review): v7 (lane-3 selector source) overwritten from t1 after the branch.
vmv.v.x v7, t1
# %bb.21: # %sw.bb1.i21
vadd.vx v6, v1, zero
j .LBB0_27
.LBB0_22: # %__clc_get_el_short4_ushort.exit19
li t0, 2
vmv.v.x v6, t0
.Lpcrel_hi8:
auipc t1, %pcrel_hi(.LBB0_27)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi8)
vbeq v4, v6, .LBB0_26
# %bb.23: # %__clc_get_el_short4_ushort.exit19
li t0, 3
vmv.v.x v9, t0
vmv.v.x v6, zero
vbne v4, v9, .LBB0_27
# NOTE(review): v7 overwritten from t1 after the branch, as above.
vmv.v.x v7, t1
# %bb.24: # %sw.bb3.i23
vadd.vx v6, v3, zero
j .LBB0_27
.LBB0_25: # %sw.bb.i20
vadd.vx v6, v8, zero
j .LBB0_27
.LBB0_26: # %sw.bb2.i22
vadd.vx v6, v2, zero
.LBB0_27: # %__clc_get_el_short4_ushort.exit25
# Label of block must be emitted
join zero, zero, 0
# ---- Lane 3: selector taken from v7, result in v8 ----
# On the selector-0 path v8 is left untouched, so it still holds the copy of
# the original v0 made at function entry.
lui t0, 16
addi t0, t0, -1
vand.vx v4, v7, t0
# NOTE(review): again no broadcast of 1 before t1 is overwritten; the vblt below compares v7 with v4.
li t1, 1
# kill: def $v9 killed $x5
.Lpcrel_hi9:
auipc t1, %pcrel_hi(.LBB0_35)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi9)
vblt v7, v4, .LBB0_31
# %bb.28: # %__clc_get_el_short4_ushort.exit25
vmv.v.x v2, zero
.Lpcrel_hi10:
auipc t1, %pcrel_hi(.LBB0_35)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi10)
vbeq v4, v2, .LBB0_35
# %bb.29: # %__clc_get_el_short4_ushort.exit25
li t0, 1
vmv.v.x v2, t0
vmv.v.x v8, zero
.Lpcrel_hi11:
auipc t1, %pcrel_hi(.LBB0_35)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi11)
vbne v4, v2, .LBB0_35
# %bb.30: # %sw.bb1.i27
vadd.vx v8, v1, zero
j .LBB0_35
.LBB0_31: # %__clc_get_el_short4_ushort.exit25
li t0, 2
vmv.v.x v1, t0
.Lpcrel_hi12:
auipc t1, %pcrel_hi(.LBB0_35)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi12)
vbeq v4, v1, .LBB0_34
# %bb.32: # %__clc_get_el_short4_ushort.exit25
li t0, 3
vmv.v.x v1, t0
vmv.v.x v8, zero
.Lpcrel_hi13:
auipc t1, %pcrel_hi(.LBB0_35)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi13)
vbne v4, v1, .LBB0_35
# %bb.33: # %sw.bb3.i29
vadd.vx v8, v3, zero
j .LBB0_35
.LBB0_34: # %sw.bb2.i28
vadd.vx v8, v2, zero
.LBB0_35: # %__clc_get_el_short4_ushort.exit31
# Label of block must be emitted
join zero, zero, 0
# Move the lane 1-3 results (v5, v6, v8) into v1-v3; lane 0 is already in v0.
vadd.vx v1, v5, zero
vadd.vx v2, v6, zero
vadd.vx v3, v8, zero
# Epilogue: reload ra and undo the sp adjustment made in the prologue.
lw ra, -4(sp) # 4-byte Folded Reload
addi sp, sp, -4
ret
.Lfunc_end0:
.size _Z7shuffleDv4_sDv4_t, .Lfunc_end0-_Z7shuffleDv4_sDv4_t
# -- End function
.ident "clang version 16.0.0 (https://github.com/ziliangzl/llvm-project.git c955d0a29cd08a1c3cdcd728e4d43cf7fc239a64)"
.section ".note.GNU-stack","",@progbits
noinline版本汇编:
.text
.attribute 4, 16
.attribute 5, "rv32i2p0_m2p0_a2p0_zfinx1p0_zdinx1p0_zve32f1p0_zve32x1p0_zvl32b1p0_zhinx1p0"
.file "shufflechar44.cl"
# Out-of-line element selector (noinline build).
# As read from the code: v0-v3 are the four candidate elements, v4 is the
# selector, and the chosen element is returned in v0 (0 when the selector
# matches none of 0..3).
# NOTE(review): setrpc / join / vblt / vbeq / vbne are presumably Ventus
# divergence / reconvergence instructions - confirm against the Ventus ISA
# documentation.
.globl __clc_get_el_short4_ushort
.p2align 2
.type __clc_get_el_short4_ushort,@function
__clc_get_el_short4_ushort:
# Prologue: sp is advanced by 4 and ra is stored in the slot just below it.
addi sp, sp, 4
sw ra, -4(sp)
# Broadcast 1 into v5, then split: presumably "1 < selector" goes to .LBB0_4 (cases 2 and 3).
li t0, 1
vmv.v.x v5, t0
.Lpcrel_hi0:
auipc t1, %pcrel_hi(.LBB0_8)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi0)
vblt v5, v4, .LBB0_4
# Selector == 0: v0 is left as it arrived and control goes straight to the exit.
vmv.v.x v2, zero
.Lpcrel_hi1:
auipc t1, %pcrel_hi(.LBB0_8)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi1)
vbeq v4, v2, .LBB0_8
# Selector == 1: v0 is preset to 0 (default), then replaced by v1 when the compare matches.
li t0, 1
vmv.v.x v2, t0
vmv.v.x v0, zero
.Lpcrel_hi2:
auipc t1, %pcrel_hi(.LBB0_8)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi2)
vbne v4, v2, .LBB0_8
vadd.vx v0, v1, zero
j .LBB0_8
.LBB0_4:
# Selector == 2: handled at .LBB0_7.
li t0, 2
vmv.v.x v0, t0
.Lpcrel_hi3:
auipc t1, %pcrel_hi(.LBB0_8)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi3)
vbeq v4, v0, .LBB0_7
# Selector == 3: v0 is preset to 0 (default), then replaced by v3 when the compare matches.
li t0, 3
vmv.v.x v1, t0
vmv.v.x v0, zero
.Lpcrel_hi4:
auipc t1, %pcrel_hi(.LBB0_8)
setrpc zero, t1, %pcrel_lo(.Lpcrel_hi4)
vbne v4, v1, .LBB0_8
vadd.vx v0, v3, zero
j .LBB0_8
.LBB0_7:
vadd.vx v0, v2, zero
.LBB0_8:
join zero, zero, 0
# Shift left then arithmetic-shift right by 16: sign-extends the low 16 bits
# of the result (assuming 32-bit elements - confirm).
vsll.vi v0, v0, 16
vsra.vi v0, v0, 16
# Epilogue: reload ra and undo the sp adjustment.
lw ra, -4(sp)
addi sp, sp, -4
ret
.Lfunc_end0:
.size __clc_get_el_short4_ushort, .Lfunc_end0-__clc_get_el_short4_ushort
# shuffle(short4, ushort4), noinline build (symbol _Z7shuffleDv4_sDv4_t):
# calls __clc_get_el_short4_ushort once per result lane.
# Register use as read from the code below:
#   v36-v39  copies of incoming v3, v2, v1, v0 (reloaded into v0-v3 before calls 2-4)
#   v33-v35  copies of incoming v7, v6, v5 (selector sources for lanes 3, 2, 1)
#   v41      broadcast of 0xFFFF used to mask each selector
#   v40, v35, v34  results of calls 1-3; the fourth result is moved from v0 to v3
# NOTE(review): every instruction naming v32 or above is preceded by a
# "regext"; presumably it supplies the extra register-index bits - confirm
# against the Ventus ISA documentation. vsw.v / vlw.v are likewise presumably
# Ventus per-thread store / load instructions.
.globl _Z7shuffleDv4_sDv4_t
.p2align 2
.type _Z7shuffleDv4_sDv4_t,@function
_Z7shuffleDv4_sDv4_t:
# Prologue: sp += 4 for ra; tp += 36 for nine 4-byte slots; v32 = broadcast of tp,
# used as the base address for the v33-v41 saves.
addi sp, sp, 4
addi tp, tp, 36
regext zero, zero, 1
vmv.v.x v32, tp
sw ra, -4(sp)
# Save v33-v41 at offsets -4 .. -36 from v32.
regext zero, zero, 72
vsw.v v33, -4(v32)
regext zero, zero, 72
vsw.v v34, -8(v32)
regext zero, zero, 72
vsw.v v35, -12(v32)
regext zero, zero, 72
vsw.v v36, -16(v32)
regext zero, zero, 72
vsw.v v37, -20(v32)
regext zero, zero, 72
vsw.v v38, -24(v32)
regext zero, zero, 72
vsw.v v39, -28(v32)
regext zero, zero, 72
vsw.v v40, -32(v32)
regext zero, zero, 72
vsw.v v41, -36(v32)
# Copy the incoming registers v7, v6, v5, v3, v2, v1, v0 into v33-v39 so they survive the calls.
regext zero, zero, 1
vadd.vx v33, v7, zero
regext zero, zero, 1
vadd.vx v34, v6, zero
regext zero, zero, 1
vadd.vx v35, v5, zero
regext zero, zero, 1
vadd.vx v36, v3, zero
regext zero, zero, 1
vadd.vx v37, v2, zero
regext zero, zero, 1
vadd.vx v38, v1, zero
regext zero, zero, 1
vadd.vx v39, v0, zero
# t0 = (16 << 12) - 1 = 0xFFFF; mask the lane-0 selector in v4 and keep a
# broadcast copy of the mask in v41 for the later lanes.
lui t0, 16
addi t0, t0, -1
vand.vx v4, v4, t0
regext zero, zero, 1
vmv.v.x v41, t0
# Call 1 (lane 0): v0-v3 and v4 are still the incoming values; result saved in v40.
call __clc_get_el_short4_ushort
regext zero, zero, 1
vadd.vx v40, v0, zero
# Call 2 (lane 1): selector = v35 & v41; v0-v3 reloaded from v39..v36; result saved in v35.
regext zero, zero, 72
vand.vv v4, v35, v41
regext zero, zero, 64
vadd.vx v0, v39, zero
regext zero, zero, 64
vadd.vx v1, v38, zero
regext zero, zero, 64
vadd.vx v2, v37, zero
regext zero, zero, 64
vadd.vx v3, v36, zero
call __clc_get_el_short4_ushort
regext zero, zero, 1
vadd.vx v35, v0, zero
# Call 3 (lane 2): selector = v34 & v41; result saved in v34.
regext zero, zero, 72
vand.vv v4, v34, v41
regext zero, zero, 64
vadd.vx v0, v39, zero
regext zero, zero, 64
vadd.vx v1, v38, zero
regext zero, zero, 64
vadd.vx v2, v37, zero
regext zero, zero, 64
vadd.vx v3, v36, zero
call __clc_get_el_short4_ushort
regext zero, zero, 1
vadd.vx v34, v0, zero
# Call 4 (lane 3): selector = v33 & v41; result moved straight from v0 to v3.
regext zero, zero, 72
vand.vv v4, v33, v41
regext zero, zero, 64
vadd.vx v0, v39, zero
regext zero, zero, 64
vadd.vx v1, v38, zero
regext zero, zero, 64
vadd.vx v2, v37, zero
regext zero, zero, 64
vadd.vx v3, v36, zero
call __clc_get_el_short4_ushort
vadd.vx v3, v0, zero
# Assemble the return value: v0-v2 from the saved results of calls 1-3.
regext zero, zero, 64
vadd.vx v0, v40, zero
regext zero, zero, 64
vadd.vx v1, v35, zero
regext zero, zero, 64
vadd.vx v2, v34, zero
# Epilogue: reload ra and v33-v41, then undo the sp / tp adjustments and refresh v32 from tp.
lw ra, -4(sp)
regext zero, zero, 9
vlw.v v33, -4(v32)
regext zero, zero, 9
vlw.v v34, -8(v32)
regext zero, zero, 9
vlw.v v35, -12(v32)
regext zero, zero, 9
vlw.v v36, -16(v32)
regext zero, zero, 9
vlw.v v37, -20(v32)
regext zero, zero, 9
vlw.v v38, -24(v32)
regext zero, zero, 9
vlw.v v39, -28(v32)
regext zero, zero, 9
vlw.v v40, -32(v32)
regext zero, zero, 9
vlw.v v41, -36(v32)
addi sp, sp, -4
addi tp, tp, -36
regext zero, zero, 1
vmv.v.x v32, tp
ret
.Lfunc_end1:
.size _Z7shuffleDv4_sDv4_t, .Lfunc_end1-_Z7shuffleDv4_sDv4_t
.ident "clang version 16.0.0 (https://github.com/ziliangzl/llvm-project.git c955d0a29cd08a1c3cdcd728e4d43cf7fc239a64)"
.section ".note.GNU-stack","",@progbits
.addrsig
|
这个bug只有在长度为4及以上的向量类型(vec4及更长)上才会触发,所以汇编代码比较长。
是栈的问题吗@ziliangzl |
不是栈的问题,是inline优化后生成的汇编中缺少指令导致的错误。
Passed OPENCL-CTS shuffle_built_in testcase