Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Ventus][fix]Fix libclc shuffle function #133

Merged
merged 1 commit into from
Jun 27, 2024

Conversation

ziliangzl
Copy link
Collaborator

Passed OPENCL-CTS shuffle_built_in testcase

Passed OPENCL-CTS shuffle_built_in testcase
Copy link
Collaborator

@wangqinfan wangqinfan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@zhoujingya
Copy link
Collaborator

能贴出inline与不inline的汇编代码区别吗?

@ziliangzl
Copy link
Collaborator Author

以short4为例,shuffle函数代码:

/*
 * Select one component of a short4 by run-time index.
 *
 * idx 0, 1, 2, 3 returns x.s0, x.s1, x.s2, x.s3 respectively; every other
 * idx value returns 0.
 */
inline short __clc_get_el_short4_ushort(short4 x, ushort idx) {
  if (idx == 0)
    return x.s0;
  if (idx == 1)
    return x.s1;
  if (idx == 2)
    return x.s2;
  if (idx == 3)
    return x.s3;
  return 0; /* index outside 0..3 */
}

/*
 * shuffle(short4 x, ushort4 mask): build a short4 whose i-th component is the
 * component of x selected by the i-th component of mask.
 *
 * Only the two least-significant bits of each mask component are used (a
 * 4-component source needs 2 index bits); all other mask bits are ignored, as
 * the OpenCL C specification requires for shuffle(). Without this reduction a
 * mask component >= 4 would reach the helper's out-of-range branch and yield
 * 0 instead of selecting component (mask & 3).
 */
__attribute__((overloadable)) short4 shuffle(short4 x, ushort4 mask){
  short4 ret_val;
  /* Keep only the index bits that are significant for a 4-component source. */
  mask &= (ushort4)(3);
  ret_val.s0 = __clc_get_el_short4_ushort(x, mask.s0);
  ret_val.s1 = __clc_get_el_short4_ushort(x, mask.s1);
  ret_val.s2 = __clc_get_el_short4_ushort(x, mask.s2);
  ret_val.s3 = __clc_get_el_short4_ushort(x, mask.s3);
  return ret_val; }

inline版本汇编:

	.text
	.attribute	4, 16
	.attribute	5, "rv32i2p0_m2p0_a2p0_zfinx1p0_zdinx1p0_zve32f1p0_zve32x1p0_zvl32b1p0_zhinx1p0"
	.file	"shufflechar44.cl"
	.globl	_Z7shuffleDv4_sDv4_t            # -- Begin function _Z7shuffleDv4_sDv4_t
	.p2align	2
	.type	_Z7shuffleDv4_sDv4_t,@function
# shuffle(short4, ushort4) with __clc_get_el_short4_ushort inlined (compiler
# output; the "*.i" block names come from the inlined helper's switch).
#
# Register usage as far as this listing shows:
#   v0..v3  four components of x on entry, four result components on exit
#   v4..v7  four components of mask on entry
#   v8      copy of x.s0 (v0 is reused); also receives result component 3
#   v5, v6  receive result components 1 and 2
# The same select sequence is emitted four times, once per result component,
# each ending at a `join`.
#
# NOTE(review): vblt/vbeq/vbne, setrpc and join are not standard RISC-V.
# They are presumably the Ventus per-thread branch / "set reconvergence PC" /
# reconverge instructions -- confirm against the Ventus ISA documentation.
_Z7shuffleDv4_sDv4_t:                   # @_Z7shuffleDv4_sDv4_t
# %bb.0:                                # %entry
# Prologue: sp is advanced by 4 and ra is stored just below the new sp.
	addi	sp, sp, 4
	sw	ra, -4(sp)                      # 4-byte Folded Spill
# Keep x.s0 in v8 because v0 is reused below as scratch and as result 0.
	vadd.vx	v8, v0, zero
# t0 = 0xFFFF; v4 = mask.s0 with everything above the low 16 bits cleared.
	lui	t0, 16
	addi	t0, t0, -1
	vand.vx	v4, v4, t0
# v0 = 1 in every lane: comparand for the first range test.
	li	t1, 1
	vmv.v.x	v0, t1
                                        # kill: def $v9 killed $x5
# auipc + setrpc form the address of .LBB0_9 (the join for this component).
.Lpcrel_hi0:
	auipc	t1, %pcrel_hi(.LBB0_9)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi0)
# Presumably: go to .LBB0_4 where 1 < index (index is 2, 3 or out of range).
	vblt	v0, v4, .LBB0_4
# %bb.1:                                # %entry
# index is 0 or 1 here; compare it with an all-zero vector.
	vmv.v.x	v0, zero
.Lpcrel_hi1:
	auipc	t1, %pcrel_hi(.LBB0_9)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi1)
	vbeq	v4, v0, .LBB0_7
# %bb.2:                                # %entry
# v9 = 1 for the "index == 1" test; v0 = 0 is the value kept if it fails.
	li	t0, 1
	vmv.v.x	v9, t0
	vmv.v.x	v0, zero
# NOTE(review): this vbne has no auipc/setrpc pair in front of it, unlike the
# other vector branches here and unlike the corresponding vbne in the fourth
# component's sequence further down.
	vbne	v4, v9, .LBB0_9
# NOTE(review): v5 is written from t1 *after* the branch. t1 was last written
# by the auipc above (not by `li t1, 1`), and v5 is the register the next
# sequence reads as mask.s1 (`vand.vx v4, v5, t0`) and as its vblt comparand.
# This looks like the misplaced/missing instruction discussed in this thread
# -- confirm.
	vmv.v.x	v5, t1
# %bb.3:                                # %sw.bb1.i
# case 1: result 0 = x.s1
	vadd.vx	v0, v1, zero
	j	.LBB0_9
.LBB0_4:                                # %entry
# index > 1: v0 = 2 for the "index == 2" test.
	li	t0, 2
	vmv.v.x	v0, t0
.Lpcrel_hi2:
	auipc	t1, %pcrel_hi(.LBB0_9)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi2)
	vbeq	v4, v0, .LBB0_8
# %bb.5:                                # %entry
# v9 = 3 for the "index == 3" test; v0 = 0 is the value kept if it fails.
	li	t0, 3
	vmv.v.x	v9, t0
	vmv.v.x	v0, zero
# NOTE(review): same pattern as in %bb.2 -- no auipc/setrpc before the vbne,
# and v5 is written from t1 after it.
	vbne	v4, v9, .LBB0_9
	vmv.v.x	v5, t1
# %bb.6:                                # %sw.bb3.i
# case 3: result 0 = x.s3
	vadd.vx	v0, v3, zero
	j	.LBB0_9
.LBB0_7:                                # %sw.bb.i
# case 0: result 0 = x.s0 (from the copy saved in v8)
	vadd.vx	v0, v8, zero
	j	.LBB0_9
.LBB0_8:                                # %sw.bb2.i
# case 2: result 0 = x.s2
	vadd.vx	v0, v2, zero
.LBB0_9:                                # %__clc_get_el_short4_ushort.exit
                                        # Label of block must be emitted
	join	zero, zero, 0
# ---- Result component 1 (built in v5), index taken from v5 = mask.s1 ----
	lui	t0, 16
	addi	t0, t0, -1
	vand.vx	v4, v5, t0
# NOTE(review): t1 = 1 is loaded but, unlike the first sequence, no
# `vmv.v.x <vreg>, t1` follows before t1 is overwritten by auipc; the vblt
# below then uses v5 itself as the comparand.
	li	t1, 1
                                        # kill: def $v9 killed $x5
.Lpcrel_hi3:
	auipc	t1, %pcrel_hi(.LBB0_18)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi3)
	vblt	v5, v4, .LBB0_13
# %bb.10:                               # %__clc_get_el_short4_ushort.exit
# "index == 0" test against an all-zero v5.
	vmv.v.x	v5, zero
.Lpcrel_hi4:
	auipc	t1, %pcrel_hi(.LBB0_18)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi4)
	vbeq	v4, v5, .LBB0_16
# %bb.11:                               # %__clc_get_el_short4_ushort.exit
# "index == 1" test; v5 = 0 is the value kept if it fails.
	li	t0, 1
	vmv.v.x	v9, t0
	vmv.v.x	v5, zero
# NOTE(review): no auipc/setrpc before this vbne; v6 (mask.s2, read by the
# next sequence) is written from t1 after it.
	vbne	v4, v9, .LBB0_18
	vmv.v.x	v6, t1
# %bb.12:                               # %sw.bb1.i15
# case 1: result 1 = x.s1
	vadd.vx	v5, v1, zero
	j	.LBB0_18
.LBB0_13:                               # %__clc_get_el_short4_ushort.exit
# "index == 2" test.
	li	t0, 2
	vmv.v.x	v5, t0
.Lpcrel_hi5:
	auipc	t1, %pcrel_hi(.LBB0_18)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi5)
	vbeq	v4, v5, .LBB0_17
# %bb.14:                               # %__clc_get_el_short4_ushort.exit
# "index == 3" test; v5 = 0 is the value kept if it fails.
	li	t0, 3
	vmv.v.x	v9, t0
	vmv.v.x	v5, zero
# NOTE(review): same pattern -- vbne without setrpc, then v6 written from t1.
	vbne	v4, v9, .LBB0_18
	vmv.v.x	v6, t1
# %bb.15:                               # %sw.bb3.i17
# case 3: result 1 = x.s3
	vadd.vx	v5, v3, zero
	j	.LBB0_18
.LBB0_16:                               # %sw.bb.i14
# case 0: result 1 = x.s0 (copy in v8)
	vadd.vx	v5, v8, zero
	j	.LBB0_18
.LBB0_17:                               # %sw.bb2.i16
# case 2: result 1 = x.s2
	vadd.vx	v5, v2, zero
.LBB0_18:                               # %__clc_get_el_short4_ushort.exit19
                                        # Label of block must be emitted
	join	zero, zero, 0
# ---- Result component 2 (built in v6), index taken from v6 = mask.s2 ----
	lui	t0, 16
	addi	t0, t0, -1
	vand.vx	v4, v6, t0
# NOTE(review): as above, t1 = 1 is never copied into a vector register
# before auipc overwrites it; vblt compares v6 with the masked index.
	li	t1, 1
                                        # kill: def $v9 killed $x5
.Lpcrel_hi6:
	auipc	t1, %pcrel_hi(.LBB0_27)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi6)
	vblt	v6, v4, .LBB0_22
# %bb.19:                               # %__clc_get_el_short4_ushort.exit19
# "index == 0" test against an all-zero v6.
	vmv.v.x	v6, zero
.Lpcrel_hi7:
	auipc	t1, %pcrel_hi(.LBB0_27)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi7)
	vbeq	v4, v6, .LBB0_25
# %bb.20:                               # %__clc_get_el_short4_ushort.exit19
# "index == 1" test; v6 = 0 is the value kept if it fails.
	li	t0, 1
	vmv.v.x	v9, t0
	vmv.v.x	v6, zero
# NOTE(review): no auipc/setrpc before this vbne; v7 (mask.s3, read by the
# next sequence) is written from t1 after it.
	vbne	v4, v9, .LBB0_27
	vmv.v.x	v7, t1
# %bb.21:                               # %sw.bb1.i21
# case 1: result 2 = x.s1
	vadd.vx	v6, v1, zero
	j	.LBB0_27
.LBB0_22:                               # %__clc_get_el_short4_ushort.exit19
# "index == 2" test.
	li	t0, 2
	vmv.v.x	v6, t0
.Lpcrel_hi8:
	auipc	t1, %pcrel_hi(.LBB0_27)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi8)
	vbeq	v4, v6, .LBB0_26
# %bb.23:                               # %__clc_get_el_short4_ushort.exit19
# "index == 3" test; v6 = 0 is the value kept if it fails.
	li	t0, 3
	vmv.v.x	v9, t0
	vmv.v.x	v6, zero
# NOTE(review): same pattern -- vbne without setrpc, then v7 written from t1.
	vbne	v4, v9, .LBB0_27
	vmv.v.x	v7, t1
# %bb.24:                               # %sw.bb3.i23
# case 3: result 2 = x.s3
	vadd.vx	v6, v3, zero
	j	.LBB0_27
.LBB0_25:                               # %sw.bb.i20
# case 0: result 2 = x.s0 (copy in v8)
	vadd.vx	v6, v8, zero
	j	.LBB0_27
.LBB0_26:                               # %sw.bb2.i22
# case 2: result 2 = x.s2
	vadd.vx	v6, v2, zero
.LBB0_27:                               # %__clc_get_el_short4_ushort.exit25
                                        # Label of block must be emitted
	join	zero, zero, 0
# ---- Result component 3 (built in v8), index taken from v7 = mask.s3 ----
# v8 still holds x.s0 on entry to this sequence, so the "index == 0" path can
# branch straight to the join. v1 and v2 are reused as comparison scratch.
	lui	t0, 16
	addi	t0, t0, -1
	vand.vx	v4, v7, t0
# NOTE(review): t1 = 1 is again overwritten by auipc without being copied to
# a vector register; vblt compares v7 with the masked index.
	li	t1, 1
                                        # kill: def $v9 killed $x5
.Lpcrel_hi9:
	auipc	t1, %pcrel_hi(.LBB0_35)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi9)
	vblt	v7, v4, .LBB0_31
# %bb.28:                               # %__clc_get_el_short4_ushort.exit25
# "index == 0" test; on a match v8 (= x.s0) is already the result.
	vmv.v.x	v2, zero
.Lpcrel_hi10:
	auipc	t1, %pcrel_hi(.LBB0_35)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi10)
	vbeq	v4, v2, .LBB0_35
# %bb.29:                               # %__clc_get_el_short4_ushort.exit25
# "index == 1" test; v8 = 0 is the value kept if it fails. Here the vbne does
# have its own auipc/setrpc pair.
	li	t0, 1
	vmv.v.x	v2, t0
	vmv.v.x	v8, zero
.Lpcrel_hi11:
	auipc	t1, %pcrel_hi(.LBB0_35)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi11)
	vbne	v4, v2, .LBB0_35
# %bb.30:                               # %sw.bb1.i27
# case 1: result 3 = x.s1
	vadd.vx	v8, v1, zero
	j	.LBB0_35
.LBB0_31:                               # %__clc_get_el_short4_ushort.exit25
# "index == 2" test.
	li	t0, 2
	vmv.v.x	v1, t0
.Lpcrel_hi12:
	auipc	t1, %pcrel_hi(.LBB0_35)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi12)
	vbeq	v4, v1, .LBB0_34
# %bb.32:                               # %__clc_get_el_short4_ushort.exit25
# "index == 3" test; v8 = 0 is the value kept if it fails.
	li	t0, 3
	vmv.v.x	v1, t0
	vmv.v.x	v8, zero
.Lpcrel_hi13:
	auipc	t1, %pcrel_hi(.LBB0_35)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi13)
	vbne	v4, v1, .LBB0_35
# %bb.33:                               # %sw.bb3.i29
# case 3: result 3 = x.s3
	vadd.vx	v8, v3, zero
	j	.LBB0_35
.LBB0_34:                               # %sw.bb2.i28
# case 2: result 3 = x.s2
	vadd.vx	v8, v2, zero
.LBB0_35:                               # %__clc_get_el_short4_ushort.exit31
                                        # Label of block must be emitted
	join	zero, zero, 0
# Move results 1..3 into v1..v3 (result 0 is already in v0).
	vadd.vx	v1, v5, zero
	vadd.vx	v2, v6, zero
	vadd.vx	v3, v8, zero
# Epilogue: reload ra and undo the sp adjustment made on entry.
	lw	ra, -4(sp)                      # 4-byte Folded Reload
	addi	sp, sp, -4
	ret
.Lfunc_end0:
	.size	_Z7shuffleDv4_sDv4_t, .Lfunc_end0-_Z7shuffleDv4_sDv4_t
                                        # -- End function
	.ident	"clang version 16.0.0 (https://github.com/ziliangzl/llvm-project.git c955d0a29cd08a1c3cdcd728e4d43cf7fc239a64)"
	.section	".note.GNU-stack","",@progbits

noinline版本汇编:

	.text
	.attribute	4, 16
	.attribute	5, "rv32i2p0_m2p0_a2p0_zfinx1p0_zdinx1p0_zve32f1p0_zve32x1p0_zvl32b1p0_zhinx1p0"
	.file	"shufflechar44.cl"
	.globl	__clc_get_el_short4_ushort
	.p2align	2
	.type	__clc_get_el_short4_ushort,@function
# __clc_get_el_short4_ushort(short4 x, ushort idx), out-of-line copy.
# As used by the caller in this listing: v0..v3 carry x.s0..x.s3, v4 carries
# idx (the caller has already cleared everything above the low 16 bits), and
# the selected component is returned in v0. v1, v2 and v5 are overwritten
# with comparison constants along the way.
# NOTE(review): vblt/vbeq/vbne, setrpc and join are presumably the Ventus
# per-thread branch / reconvergence instructions -- confirm against the
# Ventus ISA documentation.
__clc_get_el_short4_ushort:
# Prologue: sp is advanced by 4 and ra is stored just below the new sp.
	addi	sp, sp, 4
	sw	ra, -4(sp)
# v5 = 1 in every lane: comparand for the first range test.
	li	t0, 1
	vmv.v.x	v5, t0
# auipc + setrpc form the address of .LBB0_8 (the join before the return).
.Lpcrel_hi0:
	auipc	t1, %pcrel_hi(.LBB0_8)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi0)
# Presumably: go to .LBB0_4 where 1 < idx.
	vblt	v5, v4, .LBB0_4
# idx is 0 or 1 here. If idx == 0, v0 (x.s0) is already the result.
	vmv.v.x	v2, zero
.Lpcrel_hi1:
	auipc	t1, %pcrel_hi(.LBB0_8)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi1)
	vbeq	v4, v2, .LBB0_8
# "idx == 1" test; v0 = 0 is the value kept if it fails.
	li	t0, 1
	vmv.v.x	v2, t0
	vmv.v.x	v0, zero
.Lpcrel_hi2:
	auipc	t1, %pcrel_hi(.LBB0_8)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi2)
	vbne	v4, v2, .LBB0_8
# case 1: result = x.s1
	vadd.vx	v0, v1, zero
	j	.LBB0_8
.LBB0_4:
# idx > 1: "idx == 2" test.
	li	t0, 2
	vmv.v.x	v0, t0
.Lpcrel_hi3:
	auipc	t1, %pcrel_hi(.LBB0_8)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi3)
	vbeq	v4, v0, .LBB0_7
# "idx == 3" test; v0 = 0 is the value kept if it fails (default case).
	li	t0, 3
	vmv.v.x	v1, t0
	vmv.v.x	v0, zero
.Lpcrel_hi4:
	auipc	t1, %pcrel_hi(.LBB0_8)
	setrpc	zero, t1, %pcrel_lo(.Lpcrel_hi4)
	vbne	v4, v1, .LBB0_8
# case 3: result = x.s3
	vadd.vx	v0, v3, zero
	j	.LBB0_8
.LBB0_7:
# case 2: result = x.s2
	vadd.vx	v0, v2, zero
.LBB0_8:
	join	zero, zero, 0
# Sign-extend the low 16 bits of the result: shift left by 16, then shift
# right (arithmetic) by 16.
	vsll.vi	v0, v0, 16
	vsra.vi	v0, v0, 16
# Epilogue: reload ra and undo the sp adjustment made on entry.
	lw	ra, -4(sp)
	addi	sp, sp, -4
	ret
.Lfunc_end0:
	.size	__clc_get_el_short4_ushort, .Lfunc_end0-__clc_get_el_short4_ushort

	.globl	_Z7shuffleDv4_sDv4_t
	.p2align	2
	.type	_Z7shuffleDv4_sDv4_t,@function
# shuffle(short4, ushort4) calling the out-of-line helper
# __clc_get_el_short4_ushort once per result component.
#
# Register usage as far as this listing shows:
#   v0..v3   x.s0..x.s3 on entry, result components 0..3 on exit
#   v4..v7   mask.s0..mask.s3 on entry; v4 is the helper's index argument
#   v33..v41 saved on entry and restored before return; used to keep x, the
#            remaining mask components, the 0xFFFF constant and intermediate
#            results alive across the calls
#   v32      set to tp in every lane; base for the v33..v41 save area
#
# NOTE(review): every instruction that names v32 or above is preceded by a
# `regext`; it is presumably a prefix that extends the register numbers of
# the following instruction. The immediates used here are consistent with
# 1 = destination, 8 = first source/base, 64 = second source being in the
# upper bank -- confirm against the Ventus ISA documentation.
# NOTE(review): tp is presumably the per-thread private stack pointer --
# confirm.
_Z7shuffleDv4_sDv4_t:
# Prologue: reserve 4 bytes via sp (for ra) and 36 bytes via tp (9 vector
# registers x 4 bytes), then publish the new tp in v32.
	addi	sp, sp, 4
	addi	tp, tp, 36
	regext	zero, zero, 1
	vmv.v.x	v32, tp
	sw	ra, -4(sp)
# Save v33..v41 at offsets -4..-36 from v32.
	regext	zero, zero, 72
	vsw.v	v33, -4(v32)
	regext	zero, zero, 72
	vsw.v	v34, -8(v32)
	regext	zero, zero, 72
	vsw.v	v35, -12(v32)
	regext	zero, zero, 72
	vsw.v	v36, -16(v32)
	regext	zero, zero, 72
	vsw.v	v37, -20(v32)
	regext	zero, zero, 72
	vsw.v	v38, -24(v32)
	regext	zero, zero, 72
	vsw.v	v39, -28(v32)
	regext	zero, zero, 72
	vsw.v	v40, -32(v32)
	regext	zero, zero, 72
	vsw.v	v41, -36(v32)
# Stash the arguments that must survive the calls:
#   v33 = mask.s3, v34 = mask.s2, v35 = mask.s1,
#   v36 = x.s3, v37 = x.s2, v38 = x.s1, v39 = x.s0
	regext	zero, zero, 1
	vadd.vx	v33, v7, zero
	regext	zero, zero, 1
	vadd.vx	v34, v6, zero
	regext	zero, zero, 1
	vadd.vx	v35, v5, zero
	regext	zero, zero, 1
	vadd.vx	v36, v3, zero
	regext	zero, zero, 1
	vadd.vx	v37, v2, zero
	regext	zero, zero, 1
	vadd.vx	v38, v1, zero
	regext	zero, zero, 1
	vadd.vx	v39, v0, zero
# t0 = 0xFFFF; v4 = mask.s0 & 0xFFFF is the index for the first call, and
# v41 keeps the 0xFFFF constant for the later calls.
	lui	t0, 16
	addi	t0, t0, -1
	vand.vx	v4, v4, t0
	regext	zero, zero, 1
	vmv.v.x	v41, t0
# Call 1: x is still in v0..v3. Result 0 is kept in v40.
	call	__clc_get_el_short4_ushort
	regext	zero, zero, 1
	vadd.vx	v40, v0, zero
# Call 2: index = mask.s1 & 0xFFFF; reload x into v0..v3 from v39..v36.
	regext	zero, zero, 72
	vand.vv	v4, v35, v41
	regext	zero, zero, 64
	vadd.vx	v0, v39, zero
	regext	zero, zero, 64
	vadd.vx	v1, v38, zero
	regext	zero, zero, 64
	vadd.vx	v2, v37, zero
	regext	zero, zero, 64
	vadd.vx	v3, v36, zero
	call	__clc_get_el_short4_ushort
# Result 1 is kept in v35 (mask.s1 is no longer needed).
	regext	zero, zero, 1
	vadd.vx	v35, v0, zero
# Call 3: index = mask.s2 & 0xFFFF; reload x again.
	regext	zero, zero, 72
	vand.vv	v4, v34, v41
	regext	zero, zero, 64
	vadd.vx	v0, v39, zero
	regext	zero, zero, 64
	vadd.vx	v1, v38, zero
	regext	zero, zero, 64
	vadd.vx	v2, v37, zero
	regext	zero, zero, 64
	vadd.vx	v3, v36, zero
	call	__clc_get_el_short4_ushort
# Result 2 is kept in v34 (mask.s2 is no longer needed).
	regext	zero, zero, 1
	vadd.vx	v34, v0, zero
# Call 4: index = mask.s3 & 0xFFFF; reload x again.
	regext	zero, zero, 72
	vand.vv	v4, v33, v41
	regext	zero, zero, 64
	vadd.vx	v0, v39, zero
	regext	zero, zero, 64
	vadd.vx	v1, v38, zero
	regext	zero, zero, 64
	vadd.vx	v2, v37, zero
	regext	zero, zero, 64
	vadd.vx	v3, v36, zero
	call	__clc_get_el_short4_ushort
# Assemble the return value: v3 = result 3, v0 = result 0, v1 = result 1,
# v2 = result 2.
	vadd.vx	v3, v0, zero
	regext	zero, zero, 64
	vadd.vx	v0, v40, zero
	regext	zero, zero, 64
	vadd.vx	v1, v35, zero
	regext	zero, zero, 64
	vadd.vx	v2, v34, zero
# Epilogue: reload ra and v33..v41, release both stack areas, and refresh
# v32 from the restored tp.
	lw	ra, -4(sp)
	regext	zero, zero, 9
	vlw.v	v33, -4(v32)
	regext	zero, zero, 9
	vlw.v	v34, -8(v32)
	regext	zero, zero, 9
	vlw.v	v35, -12(v32)
	regext	zero, zero, 9
	vlw.v	v36, -16(v32)
	regext	zero, zero, 9
	vlw.v	v37, -20(v32)
	regext	zero, zero, 9
	vlw.v	v38, -24(v32)
	regext	zero, zero, 9
	vlw.v	v39, -28(v32)
	regext	zero, zero, 9
	vlw.v	v40, -32(v32)
	regext	zero, zero, 9
	vlw.v	v41, -36(v32)
	addi	sp, sp, -4
	addi	tp, tp, -36
	regext	zero, zero, 1
	vmv.v.x	v32, tp
	ret
.Lfunc_end1:
	.size	_Z7shuffleDv4_sDv4_t, .Lfunc_end1-_Z7shuffleDv4_sDv4_t

	.ident	"clang version 16.0.0 (https://github.com/ziliangzl/llvm-project.git c955d0a29cd08a1c3cdcd728e4d43cf7fc239a64)"
	.section	".note.GNU-stack","",@progbits
	.addrsig

@ziliangzl
Copy link
Collaborator Author

> 能贴出inline与不inline的汇编代码区别吗?

这个bug在vec4长度以上类型才会触发,所以汇编代码比较长

@ziliangzl ziliangzl merged commit d704c61 into THU-DSP-LAB:main Jun 27, 2024
2 checks passed
@zhoujingya
Copy link
Collaborator

> > 能贴出inline与不inline的汇编代码区别吗?
>
> 这个bug在vec4长度以上类型才会触发,所以汇编代码比较长

是栈的问题吗@ziliangzl

@ziliangzl
Copy link
Collaborator Author

> > > 能贴出inline与不inline的汇编代码区别吗?
> >
> > 这个bug在vec4长度以上类型才会触发,所以汇编代码比较长
>
> 是栈的问题吗@ziliangzl

是inline优化后有缺失指令导致的错误

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants